Diffstat (limited to 'fs')
-rw-r--r-- | fs/Kconfig              |    1
-rw-r--r-- | fs/Makefile             |    1
-rw-r--r-- | fs/erofs/Kconfig        |   98
-rw-r--r-- | fs/erofs/Makefile       |   11
-rw-r--r-- | fs/erofs/compress.h     |   60
-rw-r--r-- | fs/erofs/data.c         |  423
-rw-r--r-- | fs/erofs/decompressor.c |  358
-rw-r--r-- | fs/erofs/dir.c          |  139
-rw-r--r-- | fs/erofs/erofs_fs.h     |  307
-rw-r--r-- | fs/erofs/inode.c        |  332
-rw-r--r-- | fs/erofs/internal.h     |  553
-rw-r--r-- | fs/erofs/namei.c        |  251
-rw-r--r-- | fs/erofs/super.c        |  669
-rw-r--r-- | fs/erofs/tagptr.h       |  110
-rw-r--r-- | fs/erofs/utils.c        |  333
-rw-r--r-- | fs/erofs/xattr.c        |  703
-rw-r--r-- | fs/erofs/xattr.h        |   92
-rw-r--r-- | fs/erofs/zdata.c        | 1432
-rw-r--r-- | fs/erofs/zdata.h        |  193
-rw-r--r-- | fs/erofs/zmap.c         |  466
-rw-r--r-- | fs/erofs/zpvec.h        |  157
21 files changed, 6689 insertions, 0 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index bfb1c6095c7a..669d46550e6d 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -261,6 +261,7 @@ source "fs/romfs/Kconfig" source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" +source "fs/erofs/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/Makefile b/fs/Makefile index d60089fd689b..b2e4973a0bea 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -130,3 +130,4 @@ obj-$(CONFIG_F2FS_FS) += f2fs/ obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ +obj-$(CONFIG_EROFS_FS) += erofs/ diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig new file mode 100644 index 000000000000..16316d1adca3 --- /dev/null +++ b/fs/erofs/Kconfig @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config EROFS_FS + tristate "EROFS filesystem support" + depends on BLOCK + help + EROFS (Enhanced Read-Only File System) is a lightweight + read-only file system with modern designs (eg. page-sized + blocks, inline xattrs/data, etc.) for scenarios which need + high-performance read-only requirements, e.g. Android OS + for mobile phones and LIVECDs. + + It also provides fixed-sized output compression support, + which improves storage density, keeps relatively higher + compression ratios, which is more useful to achieve high + performance for embedded devices with limited memory. + + If unsure, say N. + +config EROFS_FS_DEBUG + bool "EROFS debugging feature" + depends on EROFS_FS + help + Print debugging messages and enable more BUG_ONs which check + filesystem consistency and find potential issues aggressively, + which can be used for Android eng build, for example. + + For daily use, say N. + +config EROFS_FAULT_INJECTION + bool "EROFS fault injection facility" + depends on EROFS_FS + help + Test EROFS to inject faults such as ENOMEM, EIO, and so on. + If unsure, say N. + +config EROFS_FS_XATTR + bool "EROFS extended attributes" + depends on EROFS_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + +config EROFS_FS_POSIX_ACL + bool "EROFS Access Control Lists" + depends on EROFS_FS_XATTR + select FS_POSIX_ACL + default y + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N. + +config EROFS_FS_SECURITY + bool "EROFS Security Labels" + depends on EROFS_FS_XATTR + default y + help + Security labels provide an access control facility to support Linux + Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO + Linux. This option enables an extended attribute handler for file + security labels in the erofs filesystem, so that it requires enabling + the extended attribute support in advance. + + If you are not using a security module, say N. + +config EROFS_FS_ZIP + bool "EROFS Data Compression Support" + depends on EROFS_FS + select LZ4_DECOMPRESS + default y + help + Enable fixed-sized output compression for EROFS. + + If you don't want to enable compression feature, say N. + +config EROFS_FS_CLUSTER_PAGE_LIMIT + int "EROFS Cluster Pages Hard Limit" + depends on EROFS_FS_ZIP + range 1 256 + default "1" + help + Indicates maximum # of pages of a compressed + physical cluster. 
+ + For example, if files in a image were compressed + into 8k-unit, hard limit should not be configured + less than 2. Otherwise, the image will be refused + to mount on this kernel. + diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile new file mode 100644 index 000000000000..46f2aa4ba46c --- /dev/null +++ b/fs/erofs/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only + +EROFS_VERSION = "1.0" + +ccflags-y += -DEROFS_VERSION=\"$(EROFS_VERSION)\" + +obj-$(CONFIG_EROFS_FS) += erofs.o +erofs-objs := super.o inode.o data.o namei.o dir.o utils.o +erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o +erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o + diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h new file mode 100644 index 000000000000..07d279fd5d67 --- /dev/null +++ b/fs/erofs/compress.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2019 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_FS_COMPRESS_H +#define __EROFS_FS_COMPRESS_H + +#include "internal.h" + +enum { + Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, + Z_EROFS_COMPRESSION_RUNTIME_MAX +}; + +struct z_erofs_decompress_req { + struct super_block *sb; + struct page **in, **out; + + unsigned short pageofs_out; + unsigned int inputsize, outputsize; + + /* indicate the algorithm will be used for decompression */ + unsigned int alg; + bool inplace_io, partial_decoding; +}; + +/* + * - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) - + * used to mark temporary allocated pages from other + * file/cached pages and NULL mapping pages. + */ +#define Z_EROFS_MAPPING_STAGING ((void *)0x5A110C8D) + +/* check if a page is marked as staging */ +static inline bool z_erofs_page_is_staging(struct page *page) +{ + return page->mapping == Z_EROFS_MAPPING_STAGING; +} + +static inline bool z_erofs_put_stagingpage(struct list_head *pagepool, + struct page *page) +{ + if (!z_erofs_page_is_staging(page)) + return false; + + /* staging pages should not be used by others at the same time */ + if (page_ref_count(page) > 1) + put_page(page); + else + list_add(&page->lru, pagepool); + return true; +} + +int z_erofs_decompress(struct z_erofs_decompress_req *rq, + struct list_head *pagepool); + +#endif + diff --git a/fs/erofs/data.c b/fs/erofs/data.c new file mode 100644 index 000000000000..fda16ec8863e --- /dev/null +++ b/fs/erofs/data.c @@ -0,0 +1,423 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. 
+ * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "internal.h" +#include <linux/prefetch.h> + +#include <trace/events/erofs.h> + +static inline void read_endio(struct bio *bio) +{ + struct super_block *const sb = bio->bi_private; + struct bio_vec *bvec; + blk_status_t err = bio->bi_status; + struct bvec_iter_all iter_all; + + if (time_to_inject(EROFS_SB(sb), FAULT_READ_IO)) { + erofs_show_injection_info(FAULT_READ_IO); + err = BLK_STS_IOERR; + } + + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + + /* page is already locked */ + DBG_BUGON(PageUptodate(page)); + + if (unlikely(err)) + SetPageError(page); + else + SetPageUptodate(page); + + unlock_page(page); + /* page could be reclaimed now */ + } + bio_put(bio); +} + +/* prio -- true is used for dir */ +struct page *__erofs_get_meta_page(struct super_block *sb, + erofs_blk_t blkaddr, bool prio, bool nofail) +{ + struct inode *const bd_inode = sb->s_bdev->bd_inode; + struct address_space *const mapping = bd_inode->i_mapping; + /* prefer retrying in the allocator to blindly looping below */ + const gfp_t gfp = mapping_gfp_constraint(mapping, ~__GFP_FS) | + (nofail ? __GFP_NOFAIL : 0); + unsigned int io_retries = nofail ? EROFS_IO_MAX_RETRIES_NOFAIL : 0; + struct page *page; + int err; + +repeat: + page = find_or_create_page(mapping, blkaddr, gfp); + if (unlikely(!page)) { + DBG_BUGON(nofail); + return ERR_PTR(-ENOMEM); + } + DBG_BUGON(!PageLocked(page)); + + if (!PageUptodate(page)) { + struct bio *bio; + + bio = erofs_grab_bio(sb, blkaddr, 1, sb, read_endio, nofail); + if (IS_ERR(bio)) { + DBG_BUGON(nofail); + err = PTR_ERR(bio); + goto err_out; + } + + err = bio_add_page(bio, page, PAGE_SIZE, 0); + if (unlikely(err != PAGE_SIZE)) { + err = -EFAULT; + goto err_out; + } + + __submit_bio(bio, REQ_OP_READ, + REQ_META | (prio ? REQ_PRIO : 0)); + + lock_page(page); + + /* this page has been truncated by others */ + if (unlikely(page->mapping != mapping)) { +unlock_repeat: + unlock_page(page); + put_page(page); + goto repeat; + } + + /* more likely a read error */ + if (unlikely(!PageUptodate(page))) { + if (io_retries) { + --io_retries; + goto unlock_repeat; + } + err = -EIO; + goto err_out; + } + } + return page; + +err_out: + unlock_page(page); + put_page(page); + return ERR_PTR(err); +} + +static int erofs_map_blocks_flatmode(struct inode *inode, + struct erofs_map_blocks *map, + int flags) +{ + int err = 0; + erofs_blk_t nblocks, lastblk; + u64 offset = map->m_la; + struct erofs_vnode *vi = EROFS_V(inode); + + trace_erofs_map_blocks_flatmode_enter(inode, map, flags); + + nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE); + lastblk = nblocks - is_inode_flat_inline(inode); + + if (unlikely(offset >= inode->i_size)) { + /* leave out-of-bound access unmapped */ + map->m_flags = 0; + map->m_plen = 0; + goto out; + } + + /* there is no hole in flatmode */ + map->m_flags = EROFS_MAP_MAPPED; + + if (offset < blknr_to_addr(lastblk)) { + map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la; + map->m_plen = blknr_to_addr(lastblk) - offset; + } else if (is_inode_flat_inline(inode)) { + /* 2 - inode inline B: inode, [xattrs], inline last blk... 
*/ + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); + + map->m_pa = iloc(sbi, vi->nid) + vi->inode_isize + + vi->xattr_isize + erofs_blkoff(map->m_la); + map->m_plen = inode->i_size - offset; + + /* inline data should be located in one meta block */ + if (erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE) { + errln("inline data cross block boundary @ nid %llu", + vi->nid); + DBG_BUGON(1); + err = -EFSCORRUPTED; + goto err_out; + } + + map->m_flags |= EROFS_MAP_META; + } else { + errln("internal error @ nid: %llu (size %llu), m_la 0x%llx", + vi->nid, inode->i_size, map->m_la); + DBG_BUGON(1); + err = -EIO; + goto err_out; + } + +out: + map->m_llen = map->m_plen; + +err_out: + trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0); + return err; +} + +int erofs_map_blocks(struct inode *inode, + struct erofs_map_blocks *map, int flags) +{ + if (unlikely(is_inode_layout_compression(inode))) { + int err = z_erofs_map_blocks_iter(inode, map, flags); + + if (map->mpage) { + put_page(map->mpage); + map->mpage = NULL; + } + return err; + } + return erofs_map_blocks_flatmode(inode, map, flags); +} + +static inline struct bio *erofs_read_raw_page(struct bio *bio, + struct address_space *mapping, + struct page *page, + erofs_off_t *last_block, + unsigned int nblocks, + bool ra) +{ + struct inode *const inode = mapping->host; + struct super_block *const sb = inode->i_sb; + erofs_off_t current_block = (erofs_off_t)page->index; + int err; + + DBG_BUGON(!nblocks); + + if (PageUptodate(page)) { + err = 0; + goto has_updated; + } + + /* note that for readpage case, bio also equals to NULL */ + if (bio && + /* not continuous */ + *last_block + 1 != current_block) { +submit_bio_retry: + __submit_bio(bio, REQ_OP_READ, 0); + bio = NULL; + } + + if (!bio) { + struct erofs_map_blocks map = { + .m_la = blknr_to_addr(current_block), + }; + erofs_blk_t blknr; + unsigned int blkoff; + + err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + if (unlikely(err)) + goto err_out; + + /* zero out the holed page */ + if (unlikely(!(map.m_flags & EROFS_MAP_MAPPED))) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + + /* imply err = 0, see erofs_map_blocks */ + goto has_updated; + } + + /* for RAW access mode, m_plen must be equal to m_llen */ + DBG_BUGON(map.m_plen != map.m_llen); + + blknr = erofs_blknr(map.m_pa); + blkoff = erofs_blkoff(map.m_pa); + + /* deal with inline page */ + if (map.m_flags & EROFS_MAP_META) { + void *vsrc, *vto; + struct page *ipage; + + DBG_BUGON(map.m_plen > PAGE_SIZE); + + ipage = erofs_get_meta_page(inode->i_sb, blknr, 0); + + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto err_out; + } + + vsrc = kmap_atomic(ipage); + vto = kmap_atomic(page); + memcpy(vto, vsrc + blkoff, map.m_plen); + memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen); + kunmap_atomic(vto); + kunmap_atomic(vsrc); + flush_dcache_page(page); + + SetPageUptodate(page); + /* TODO: could we unlock the page earlier? 
*/ + unlock_page(ipage); + put_page(ipage); + + /* imply err = 0, see erofs_map_blocks */ + goto has_updated; + } + + /* pa must be block-aligned for raw reading */ + DBG_BUGON(erofs_blkoff(map.m_pa)); + + /* max # of continuous pages */ + if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE)) + nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE); + if (nblocks > BIO_MAX_PAGES) + nblocks = BIO_MAX_PAGES; + + bio = erofs_grab_bio(sb, blknr, nblocks, sb, + read_endio, false); + if (IS_ERR(bio)) { + err = PTR_ERR(bio); + bio = NULL; + goto err_out; + } + } + + err = bio_add_page(bio, page, PAGE_SIZE, 0); + /* out of the extent or bio is full */ + if (err < PAGE_SIZE) + goto submit_bio_retry; + + *last_block = current_block; + + /* shift in advance in case of it followed by too many gaps */ + if (bio->bi_iter.bi_size >= bio->bi_max_vecs * PAGE_SIZE) { + /* err should reassign to 0 after submitting */ + err = 0; + goto submit_bio_out; + } + + return bio; + +err_out: + /* for sync reading, set page error immediately */ + if (!ra) { + SetPageError(page); + ClearPageUptodate(page); + } +has_updated: + unlock_page(page); + + /* if updated manually, continuous pages has a gap */ + if (bio) +submit_bio_out: + __submit_bio(bio, REQ_OP_READ, 0); + + return unlikely(err) ? ERR_PTR(err) : NULL; +} + +/* + * since we dont have write or truncate flows, so no inode + * locking needs to be held at the moment. + */ +static int erofs_raw_access_readpage(struct file *file, struct page *page) +{ + erofs_off_t last_block; + struct bio *bio; + + trace_erofs_readpage(page, true); + + bio = erofs_read_raw_page(NULL, page->mapping, + page, &last_block, 1, false); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + DBG_BUGON(bio); /* since we have only one bio -- must be NULL */ + return 0; +} + +static int erofs_raw_access_readpages(struct file *filp, + struct address_space *mapping, + struct list_head *pages, + unsigned int nr_pages) +{ + erofs_off_t last_block; + struct bio *bio = NULL; + gfp_t gfp = readahead_gfp_mask(mapping); + struct page *page = list_last_entry(pages, struct page, lru); + + trace_erofs_readpages(mapping->host, page, nr_pages, true); + + for (; nr_pages; --nr_pages) { + page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + + if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) { + bio = erofs_read_raw_page(bio, mapping, page, + &last_block, nr_pages, true); + + /* all the page errors are ignored when readahead */ + if (IS_ERR(bio)) { + pr_err("%s, readahead error at page %lu of nid %llu\n", + __func__, page->index, + EROFS_V(mapping->host)->nid); + + bio = NULL; + } + } + + /* pages could still be locked */ + put_page(page); + } + DBG_BUGON(!list_empty(pages)); + + /* the rare case (end in gaps) */ + if (unlikely(bio)) + __submit_bio(bio, REQ_OP_READ, 0); + return 0; +} + +static int erofs_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + struct erofs_map_blocks map = { + .m_la = iblock << 9, + }; + int err; + + err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + if (err) + return err; + + if (map.m_flags & EROFS_MAP_MAPPED) + bh->b_blocknr = erofs_blknr(map.m_pa); + + return err; +} + +static sector_t erofs_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + + if (is_inode_flat_inline(inode)) { + erofs_blk_t blks = i_size_read(inode) >> LOG_BLOCK_SIZE; + + if (block >> LOG_SECTORS_PER_BLOCK >= blks) + return 0; + } + + return generic_block_bmap(mapping, 
block, erofs_get_block); +} + +/* for uncompressed (aligned) files and raw access for other files */ +const struct address_space_operations erofs_raw_access_aops = { + .readpage = erofs_raw_access_readpage, + .readpages = erofs_raw_access_readpages, + .bmap = erofs_bmap, +}; + diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c new file mode 100644 index 000000000000..5f4b7f302863 --- /dev/null +++ b/fs/erofs/decompressor.c @@ -0,0 +1,358 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2019 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "compress.h" +#include <linux/module.h> +#include <linux/lz4.h> + +#ifndef LZ4_DISTANCE_MAX /* history window size */ +#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ +#endif + +#define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1) +#ifndef LZ4_DECOMPRESS_INPLACE_MARGIN +#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32) +#endif + +struct z_erofs_decompressor { + /* + * if destpages have sparsed pages, fill them with bounce pages. + * it also check whether destpages indicate continuous physical memory. + */ + int (*prepare_destpages)(struct z_erofs_decompress_req *rq, + struct list_head *pagepool); + int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out); + char *name; +}; + +static bool use_vmap; +module_param(use_vmap, bool, 0444); +MODULE_PARM_DESC(use_vmap, "Use vmap() instead of vm_map_ram() (default 0)"); + +static int lz4_prepare_destpages(struct z_erofs_decompress_req *rq, + struct list_head *pagepool) +{ + const unsigned int nr = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL }; + unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES, + BITS_PER_LONG)] = { 0 }; + void *kaddr = NULL; + unsigned int i, j, top; + + top = 0; + for (i = j = 0; i < nr; ++i, ++j) { + struct page *const page = rq->out[i]; + struct page *victim; + + if (j >= LZ4_MAX_DISTANCE_PAGES) + j = 0; + + /* 'valid' bounced can only be tested after a complete round */ + if (test_bit(j, bounced)) { + DBG_BUGON(i < LZ4_MAX_DISTANCE_PAGES); + DBG_BUGON(top >= LZ4_MAX_DISTANCE_PAGES); + availables[top++] = rq->out[i - LZ4_MAX_DISTANCE_PAGES]; + } + + if (page) { + __clear_bit(j, bounced); + if (kaddr) { + if (kaddr + PAGE_SIZE == page_address(page)) + kaddr += PAGE_SIZE; + else + kaddr = NULL; + } else if (!i) { + kaddr = page_address(page); + } + continue; + } + kaddr = NULL; + __set_bit(j, bounced); + + if (top) { + victim = availables[--top]; + get_page(victim); + } else { + victim = erofs_allocpage(pagepool, GFP_KERNEL, false); + if (unlikely(!victim)) + return -ENOMEM; + victim->mapping = Z_EROFS_MAPPING_STAGING; + } + rq->out[i] = victim; + } + return kaddr ? 1 : 0; +} + +static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq, + u8 *src, unsigned int pageofs_in) +{ + /* + * if in-place decompression is ongoing, those decompressed + * pages should be copied in order to avoid being overlapped. 
+ */ + struct page **in = rq->in; + u8 *const tmp = erofs_get_pcpubuf(0); + u8 *tmpp = tmp; + unsigned int inlen = rq->inputsize - pageofs_in; + unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in); + + while (tmpp < tmp + inlen) { + if (!src) + src = kmap_atomic(*in); + memcpy(tmpp, src + pageofs_in, count); + kunmap_atomic(src); + src = NULL; + tmpp += count; + pageofs_in = 0; + count = PAGE_SIZE; + ++in; + } + return tmp; +} + +static int lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) +{ + unsigned int inputmargin, inlen; + u8 *src; + bool copied, support_0padding; + int ret; + + if (rq->inputsize > PAGE_SIZE) + return -EOPNOTSUPP; + + src = kmap_atomic(*rq->in); + inputmargin = 0; + support_0padding = false; + + /* decompression inplace is only safe when 0padding is enabled */ + if (EROFS_SB(rq->sb)->requirements & EROFS_REQUIREMENT_LZ4_0PADDING) { + support_0padding = true; + + while (!src[inputmargin & ~PAGE_MASK]) + if (!(++inputmargin & ~PAGE_MASK)) + break; + + if (inputmargin >= rq->inputsize) { + kunmap_atomic(src); + return -EIO; + } + } + + copied = false; + inlen = rq->inputsize - inputmargin; + if (rq->inplace_io) { + const uint oend = (rq->pageofs_out + + rq->outputsize) & ~PAGE_MASK; + const uint nr = PAGE_ALIGN(rq->pageofs_out + + rq->outputsize) >> PAGE_SHIFT; + + if (rq->partial_decoding || !support_0padding || + rq->out[nr - 1] != rq->in[0] || + rq->inputsize - oend < + LZ4_DECOMPRESS_INPLACE_MARGIN(inlen)) { + src = generic_copy_inplace_data(rq, src, inputmargin); + inputmargin = 0; + copied = true; + } + } + + ret = LZ4_decompress_safe_partial(src + inputmargin, out, + inlen, rq->outputsize, + rq->outputsize); + if (ret < 0) { + errln("%s, failed to decompress, in[%p, %u, %u] out[%p, %u]", + __func__, src + inputmargin, inlen, inputmargin, + out, rq->outputsize); + WARN_ON(1); + print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET, + 16, 1, src + inputmargin, inlen, true); + print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET, + 16, 1, out, rq->outputsize, true); + ret = -EIO; + } + + if (copied) + erofs_put_pcpubuf(src); + else + kunmap_atomic(src); + return ret; +} + +static struct z_erofs_decompressor decompressors[] = { + [Z_EROFS_COMPRESSION_SHIFTED] = { + .name = "shifted" + }, + [Z_EROFS_COMPRESSION_LZ4] = { + .prepare_destpages = lz4_prepare_destpages, + .decompress = lz4_decompress, + .name = "lz4" + }, +}; + +static void copy_from_pcpubuf(struct page **out, const char *dst, + unsigned short pageofs_out, + unsigned int outputsize) +{ + const char *end = dst + outputsize; + const unsigned int righthalf = PAGE_SIZE - pageofs_out; + const char *cur = dst - pageofs_out; + + while (cur < end) { + struct page *const page = *out++; + + if (page) { + char *buf = kmap_atomic(page); + + if (cur >= dst) { + memcpy(buf, cur, min_t(uint, PAGE_SIZE, + end - cur)); + } else { + memcpy(buf + pageofs_out, cur + pageofs_out, + min_t(uint, righthalf, end - cur)); + } + kunmap_atomic(buf); + } + cur += PAGE_SIZE; + } +} + +static void *erofs_vmap(struct page **pages, unsigned int count) +{ + int i = 0; + + if (use_vmap) + return vmap(pages, count, VM_MAP, PAGE_KERNEL); + + while (1) { + void *addr = vm_map_ram(pages, count, -1, PAGE_KERNEL); + + /* retry two more times (totally 3 times) */ + if (addr || ++i >= 3) + return addr; + vm_unmap_aliases(); + } + return NULL; +} + +static void erofs_vunmap(const void *mem, unsigned int count) +{ + if (!use_vmap) + vm_unmap_ram(mem, count); + else + vunmap(mem); +} + +static int decompress_generic(struct 
z_erofs_decompress_req *rq, + struct list_head *pagepool) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const struct z_erofs_decompressor *alg = decompressors + rq->alg; + unsigned int dst_maptype; + void *dst; + int ret; + + if (nrpages_out == 1 && !rq->inplace_io) { + DBG_BUGON(!*rq->out); + dst = kmap_atomic(*rq->out); + dst_maptype = 0; + goto dstmap_out; + } + + /* + * For the case of small output size (especially much less + * than PAGE_SIZE), memcpy the decompressed data rather than + * compressed data is preferred. + */ + if (rq->outputsize <= PAGE_SIZE * 7 / 8) { + dst = erofs_get_pcpubuf(0); + if (IS_ERR(dst)) + return PTR_ERR(dst); + + rq->inplace_io = false; + ret = alg->decompress(rq, dst); + if (!ret) + copy_from_pcpubuf(rq->out, dst, rq->pageofs_out, + rq->outputsize); + + erofs_put_pcpubuf(dst); + return ret; + } + + ret = alg->prepare_destpages(rq, pagepool); + if (ret < 0) { + return ret; + } else if (ret) { + dst = page_address(*rq->out); + dst_maptype = 1; + goto dstmap_out; + } + + dst = erofs_vmap(rq->out, nrpages_out); + if (!dst) + return -ENOMEM; + dst_maptype = 2; + +dstmap_out: + ret = alg->decompress(rq, dst + rq->pageofs_out); + + if (!dst_maptype) + kunmap_atomic(dst); + else if (dst_maptype == 2) + erofs_vunmap(dst, nrpages_out); + return ret; +} + +static int shifted_decompress(const struct z_erofs_decompress_req *rq, + struct list_head *pagepool) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int righthalf = PAGE_SIZE - rq->pageofs_out; + unsigned char *src, *dst; + + if (nrpages_out > 2) { + DBG_BUGON(1); + return -EIO; + } + + if (rq->out[0] == *rq->in) { + DBG_BUGON(nrpages_out != 1); + return 0; + } + + src = kmap_atomic(*rq->in); + if (!rq->out[0]) { + dst = NULL; + } else { + dst = kmap_atomic(rq->out[0]); + memcpy(dst + rq->pageofs_out, src, righthalf); + } + + if (rq->out[1] == *rq->in) { + memmove(src, src + righthalf, rq->pageofs_out); + } else if (nrpages_out == 2) { + if (dst) + kunmap_atomic(dst); + DBG_BUGON(!rq->out[1]); + dst = kmap_atomic(rq->out[1]); + memcpy(dst, src + righthalf, rq->pageofs_out); + } + if (dst) + kunmap_atomic(dst); + kunmap_atomic(src); + return 0; +} + +int z_erofs_decompress(struct z_erofs_decompress_req *rq, + struct list_head *pagepool) +{ + if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED) + return shifted_decompress(rq, pagepool); + return decompress_generic(rq, pagepool); +} + diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c new file mode 100644 index 000000000000..1976e60e5174 --- /dev/null +++ b/fs/erofs/dir.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. 
+ * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "internal.h" + +static void debug_one_dentry(unsigned char d_type, const char *de_name, + unsigned int de_namelen) +{ +#ifdef CONFIG_EROFS_FS_DEBUG + /* since the on-disk name could not have the trailing '\0' */ + unsigned char dbg_namebuf[EROFS_NAME_LEN + 1]; + + memcpy(dbg_namebuf, de_name, de_namelen); + dbg_namebuf[de_namelen] = '\0'; + + debugln("found dirent %s de_len %u d_type %d", dbg_namebuf, + de_namelen, d_type); +#endif +} + +static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, + void *dentry_blk, unsigned int *ofs, + unsigned int nameoff, unsigned int maxsize) +{ + struct erofs_dirent *de = dentry_blk + *ofs; + const struct erofs_dirent *end = dentry_blk + nameoff; + + while (de < end) { + const char *de_name; + unsigned int de_namelen; + unsigned char d_type; + + d_type = fs_ftype_to_dtype(de->file_type); + + nameoff = le16_to_cpu(de->nameoff); + de_name = (char *)dentry_blk + nameoff; + + /* the last dirent in the block? */ + if (de + 1 >= end) + de_namelen = strnlen(de_name, maxsize - nameoff); + else + de_namelen = le16_to_cpu(de[1].nameoff) - nameoff; + + /* a corrupted entry is found */ + if (unlikely(nameoff + de_namelen > maxsize || + de_namelen > EROFS_NAME_LEN)) { + errln("bogus dirent @ nid %llu", EROFS_V(dir)->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + + debug_one_dentry(d_type, de_name, de_namelen); + if (!dir_emit(ctx, de_name, de_namelen, + le64_to_cpu(de->nid), d_type)) + /* stopped by some reason */ + return 1; + ++de; + *ofs += sizeof(struct erofs_dirent); + } + *ofs = maxsize; + return 0; +} + +static int erofs_readdir(struct file *f, struct dir_context *ctx) +{ + struct inode *dir = file_inode(f); + struct address_space *mapping = dir->i_mapping; + const size_t dirsize = i_size_read(dir); + unsigned int i = ctx->pos / EROFS_BLKSIZ; + unsigned int ofs = ctx->pos % EROFS_BLKSIZ; + int err = 0; + bool initial = true; + + while (ctx->pos < dirsize) { + struct page *dentry_page; + struct erofs_dirent *de; + unsigned int nameoff, maxsize; + + dentry_page = read_mapping_page(mapping, i, NULL); + if (dentry_page == ERR_PTR(-ENOMEM)) { + err = -ENOMEM; + break; + } else if (IS_ERR(dentry_page)) { + errln("fail to readdir of logical block %u of nid %llu", + i, EROFS_V(dir)->nid); + err = -EFSCORRUPTED; + break; + } + + de = (struct erofs_dirent *)kmap(dentry_page); + + nameoff = le16_to_cpu(de->nameoff); + + if (unlikely(nameoff < sizeof(struct erofs_dirent) || + nameoff >= PAGE_SIZE)) { + errln("%s, invalid de[0].nameoff %u @ nid %llu", + __func__, nameoff, EROFS_V(dir)->nid); + err = -EFSCORRUPTED; + goto skip_this; + } + + maxsize = min_t(unsigned int, + dirsize - ctx->pos + ofs, PAGE_SIZE); + + /* search dirents at the arbitrary position */ + if (unlikely(initial)) { + initial = false; + + ofs = roundup(ofs, sizeof(struct erofs_dirent)); + if (unlikely(ofs >= nameoff)) + goto skip_this; + } + + err = erofs_fill_dentries(dir, ctx, de, &ofs, + nameoff, maxsize); +skip_this: + kunmap(dentry_page); + + put_page(dentry_page); + + ctx->pos = blknr_to_addr(i) + ofs; + + if (unlikely(err)) + break; + ++i; + ofs = 0; + } + return err < 0 ? 
err : 0; +} + +const struct file_operations erofs_dir_fops = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate_shared = erofs_readdir, +}; + diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h new file mode 100644 index 000000000000..afa7d45ca958 --- /dev/null +++ b/fs/erofs/erofs_fs.h @@ -0,0 +1,307 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR Apache-2.0 */ +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_FS_H +#define __EROFS_FS_H + +/* Enhanced(Extended) ROM File System */ +#define EROFS_SUPER_OFFSET 1024 + +/* + * Any bits that aren't in EROFS_ALL_REQUIREMENTS should be + * incompatible with this kernel version. + */ +#define EROFS_REQUIREMENT_LZ4_0PADDING 0x00000001 +#define EROFS_ALL_REQUIREMENTS EROFS_REQUIREMENT_LZ4_0PADDING + +struct erofs_super_block { +/* 0 */__le32 magic; /* in the little endian */ +/* 4 */__le32 checksum; /* crc32c(super_block) */ +/* 8 */__le32 features; /* (aka. feature_compat) */ +/* 12 */__u8 blkszbits; /* support block_size == PAGE_SIZE only */ +/* 13 */__u8 reserved; + +/* 14 */__le16 root_nid; +/* 16 */__le64 inos; /* total valid ino # (== f_files - f_favail) */ + +/* 24 */__le64 build_time; /* inode v1 time derivation */ +/* 32 */__le32 build_time_nsec; +/* 36 */__le32 blocks; /* used for statfs */ +/* 40 */__le32 meta_blkaddr; +/* 44 */__le32 xattr_blkaddr; +/* 48 */__u8 uuid[16]; /* 128-bit uuid for volume */ +/* 64 */__u8 volume_name[16]; /* volume name */ +/* 80 */__le32 requirements; /* (aka. feature_incompat) */ + +/* 84 */__u8 reserved2[44]; +} __packed; /* 128 bytes */ + +/* + * erofs inode data mapping: + * 0 - inode plain without inline data A: + * inode, [xattrs], ... | ... | no-holed data + * 1 - inode VLE compression B (legacy): + * inode, [xattrs], extents ... | ... + * 2 - inode plain with inline data C: + * inode, [xattrs], last_inline_data, ... | ... | no-holed data + * 3 - inode compression D: + * inode, [xattrs], map_header, extents ... | ... 
+ * 4~7 - reserved + */ +enum { + EROFS_INODE_FLAT_PLAIN, + EROFS_INODE_FLAT_COMPRESSION_LEGACY, + EROFS_INODE_FLAT_INLINE, + EROFS_INODE_FLAT_COMPRESSION, + EROFS_INODE_LAYOUT_MAX +}; + +static inline bool erofs_inode_is_data_compressed(unsigned int datamode) +{ + if (datamode == EROFS_INODE_FLAT_COMPRESSION) + return true; + return datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY; +} + +/* bit definitions of inode i_advise */ +#define EROFS_I_VERSION_BITS 1 +#define EROFS_I_DATA_MAPPING_BITS 3 + +#define EROFS_I_VERSION_BIT 0 +#define EROFS_I_DATA_MAPPING_BIT 1 + +struct erofs_inode_v1 { +/* 0 */__le16 i_advise; + +/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ +/* 2 */__le16 i_xattr_icount; +/* 4 */__le16 i_mode; +/* 6 */__le16 i_nlink; +/* 8 */__le32 i_size; +/* 12 */__le32 i_reserved; +/* 16 */union { + /* file total compressed blocks for data mapping 1 */ + __le32 compressed_blocks; + __le32 raw_blkaddr; + + /* for device files, used to indicate old/new device # */ + __le32 rdev; + } i_u __packed; +/* 20 */__le32 i_ino; /* only used for 32-bit stat compatibility */ +/* 24 */__le16 i_uid; +/* 26 */__le16 i_gid; +/* 28 */__le32 i_reserved2; +} __packed; + +/* 32 bytes on-disk inode */ +#define EROFS_INODE_LAYOUT_V1 0 +/* 64 bytes on-disk inode */ +#define EROFS_INODE_LAYOUT_V2 1 + +struct erofs_inode_v2 { +/* 0 */__le16 i_advise; + +/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ +/* 2 */__le16 i_xattr_icount; +/* 4 */__le16 i_mode; +/* 6 */__le16 i_reserved; +/* 8 */__le64 i_size; +/* 16 */union { + /* file total compressed blocks for data mapping 1 */ + __le32 compressed_blocks; + __le32 raw_blkaddr; + + /* for device files, used to indicate old/new device # */ + __le32 rdev; + } i_u __packed; + + /* only used for 32-bit stat compatibility */ +/* 20 */__le32 i_ino; + +/* 24 */__le32 i_uid; +/* 28 */__le32 i_gid; +/* 32 */__le64 i_ctime; +/* 40 */__le32 i_ctime_nsec; +/* 44 */__le32 i_nlink; +/* 48 */__u8 i_reserved2[16]; +} __packed; /* 64 bytes */ + +#define EROFS_MAX_SHARED_XATTRS (128) +/* h_shared_count between 129 ... 255 are special # */ +#define EROFS_SHARED_XATTR_EXTENT (255) + +/* + * inline xattrs (n == i_xattr_icount): + * erofs_xattr_ibody_header(1) + (n - 1) * 4 bytes + * 12 bytes / \ + * / \ + * /-----------------------\ + * | erofs_xattr_entries+ | + * +-----------------------+ + * inline xattrs must starts in erofs_xattr_ibody_header, + * for read-only fs, no need to introduce h_refcount + */ +struct erofs_xattr_ibody_header { + __le32 h_reserved; + __u8 h_shared_count; + __u8 h_reserved2[7]; + __le32 h_shared_xattrs[0]; /* shared xattr id array */ +} __packed; + +/* Name indexes */ +#define EROFS_XATTR_INDEX_USER 1 +#define EROFS_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EROFS_XATTR_INDEX_TRUSTED 4 +#define EROFS_XATTR_INDEX_LUSTRE 5 +#define EROFS_XATTR_INDEX_SECURITY 6 + +/* xattr entry (for both inline & shared xattrs) */ +struct erofs_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_size; /* size of attribute value */ + /* followed by e_name and e_value */ + char e_name[0]; /* attribute name */ +} __packed; + +#define ondisk_xattr_ibody_size(count) ({\ + u32 __count = le16_to_cpu(count); \ + ((__count) == 0) ? 
0 : \ + sizeof(struct erofs_xattr_ibody_header) + \ + sizeof(__u32) * ((__count) - 1); }) + +#define EROFS_XATTR_ALIGN(size) round_up(size, sizeof(struct erofs_xattr_entry)) +#define EROFS_XATTR_ENTRY_SIZE(entry) EROFS_XATTR_ALIGN( \ + sizeof(struct erofs_xattr_entry) + \ + (entry)->e_name_len + le16_to_cpu((entry)->e_value_size)) + +/* available compression algorithm types */ +enum { + Z_EROFS_COMPRESSION_LZ4, + Z_EROFS_COMPRESSION_MAX +}; + +/* + * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) + * e.g. for 4k logical cluster size, 4B if compacted 2B is off; + * (4B) + 2B + (4B) if compacted 2B is on. + */ +#define Z_EROFS_ADVISE_COMPACTED_2B_BIT 0 + +#define Z_EROFS_ADVISE_COMPACTED_2B (1 << Z_EROFS_ADVISE_COMPACTED_2B_BIT) + +struct z_erofs_map_header { + __le32 h_reserved1; + __le16 h_advise; + /* + * bit 0-3 : algorithm type of head 1 (logical cluster type 01); + * bit 4-7 : algorithm type of head 2 (logical cluster type 11). + */ + __u8 h_algorithmtype; + /* + * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096; + * bit 3-4 : (physical - logical) cluster bits of head 1: + * For example, if logical clustersize = 4096, 1 for 8192. + * bit 5-7 : (physical - logical) cluster bits of head 2. + */ + __u8 h_clusterbits; +}; + +#define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8 + +/* + * Z_EROFS Variable-sized Logical Extent cluster type: + * 0 - literal (uncompressed) cluster + * 1 - compressed cluster (for the head logical cluster) + * 2 - compressed cluster (for the other logical clusters) + * + * In detail, + * 0 - literal (uncompressed) cluster, + * di_advise = 0 + * di_clusterofs = the literal data offset of the cluster + * di_blkaddr = the blkaddr of the literal cluster + * + * 1 - compressed cluster (for the head logical cluster) + * di_advise = 1 + * di_clusterofs = the decompressed data offset of the cluster + * di_blkaddr = the blkaddr of the compressed cluster + * + * 2 - compressed cluster (for the other logical clusters) + * di_advise = 2 + * di_clusterofs = + * the decompressed data offset in its own head cluster + * di_u.delta[0] = distance to its corresponding head cluster + * di_u.delta[1] = distance to its corresponding tail cluster + * (di_advise could be 0, 1 or 2) + */ +enum { + Z_EROFS_VLE_CLUSTER_TYPE_PLAIN, + Z_EROFS_VLE_CLUSTER_TYPE_HEAD, + Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD, + Z_EROFS_VLE_CLUSTER_TYPE_RESERVED, + Z_EROFS_VLE_CLUSTER_TYPE_MAX +}; + +#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2 +#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0 + +struct z_erofs_vle_decompressed_index { + __le16 di_advise; + /* where to decompress in the head cluster */ + __le16 di_clusterofs; + + union { + /* for the head cluster */ + __le32 blkaddr; + /* + * for the rest clusters + * eg. 
for 4k page-sized cluster, maximum 4K*64k = 256M) + * [0] - pointing to the head cluster + * [1] - pointing to the tail cluster + */ + __le16 delta[2]; + } di_u __packed; /* 8 bytes */ +} __packed; + +#define Z_EROFS_VLE_LEGACY_INDEX_ALIGN(size) \ + (round_up(size, sizeof(struct z_erofs_vle_decompressed_index)) + \ + sizeof(struct z_erofs_map_header) + Z_EROFS_VLE_LEGACY_HEADER_PADDING) + +/* dirent sorts in alphabet order, thus we can do binary search */ +struct erofs_dirent { + __le64 nid; /* 0, node number */ + __le16 nameoff; /* 8, start offset of file name */ + __u8 file_type; /* 10, file type */ + __u8 reserved; /* 11, reserved */ +} __packed; + +/* + * EROFS file types should match generic FT_* types and + * it seems no need to add BUILD_BUG_ONs since potential + * unmatchness will break other fses as well... + */ + +#define EROFS_NAME_LEN 255 + +/* check the EROFS on-disk layout strictly at compile time */ +static inline void erofs_check_ondisk_layout_definitions(void) +{ + BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128); + BUILD_BUG_ON(sizeof(struct erofs_inode_v1) != 32); + BUILD_BUG_ON(sizeof(struct erofs_inode_v2) != 64); + BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12); + BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4); + BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8); + BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8); + BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12); + + BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < + Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); +} + +#endif + diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c new file mode 100644 index 000000000000..80f4fe919ee7 --- /dev/null +++ b/fs/erofs/inode.c @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. 
+ * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "xattr.h" + +#include <trace/events/erofs.h> + +/* no locking */ +static int read_inode(struct inode *inode, void *data) +{ + struct erofs_vnode *vi = EROFS_V(inode); + struct erofs_inode_v1 *v1 = data; + const unsigned int advise = le16_to_cpu(v1->i_advise); + erofs_blk_t nblks = 0; + + vi->datamode = __inode_data_mapping(advise); + + if (unlikely(vi->datamode >= EROFS_INODE_LAYOUT_MAX)) { + errln("unsupported data mapping %u of nid %llu", + vi->datamode, vi->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } + + if (__inode_version(advise) == EROFS_INODE_LAYOUT_V2) { + struct erofs_inode_v2 *v2 = data; + + vi->inode_isize = sizeof(struct erofs_inode_v2); + vi->xattr_isize = ondisk_xattr_ibody_size(v2->i_xattr_icount); + + inode->i_mode = le16_to_cpu(v2->i_mode); + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) + vi->raw_blkaddr = le32_to_cpu(v2->i_u.raw_blkaddr); + else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + inode->i_rdev = + new_decode_dev(le32_to_cpu(v2->i_u.rdev)); + else if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) + inode->i_rdev = 0; + else + goto bogusimode; + + i_uid_write(inode, le32_to_cpu(v2->i_uid)); + i_gid_write(inode, le32_to_cpu(v2->i_gid)); + set_nlink(inode, le32_to_cpu(v2->i_nlink)); + + /* ns timestamp */ + inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = + le64_to_cpu(v2->i_ctime); + inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = + le32_to_cpu(v2->i_ctime_nsec); + + inode->i_size = le64_to_cpu(v2->i_size); + + /* total blocks for compressed files */ + if (is_inode_layout_compression(inode)) + nblks = le32_to_cpu(v2->i_u.compressed_blocks); + } else if (__inode_version(advise) == EROFS_INODE_LAYOUT_V1) { + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); + + vi->inode_isize = sizeof(struct erofs_inode_v1); + vi->xattr_isize = ondisk_xattr_ibody_size(v1->i_xattr_icount); + + inode->i_mode = le16_to_cpu(v1->i_mode); + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) + vi->raw_blkaddr = le32_to_cpu(v1->i_u.raw_blkaddr); + else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + inode->i_rdev = + new_decode_dev(le32_to_cpu(v1->i_u.rdev)); + else if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) + inode->i_rdev = 0; + else + goto bogusimode; + + i_uid_write(inode, le16_to_cpu(v1->i_uid)); + i_gid_write(inode, le16_to_cpu(v1->i_gid)); + set_nlink(inode, le16_to_cpu(v1->i_nlink)); + + /* use build time to derive all file time */ + inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = + sbi->build_time; + inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = + sbi->build_time_nsec; + + inode->i_size = le32_to_cpu(v1->i_size); + if (is_inode_layout_compression(inode)) + nblks = le32_to_cpu(v1->i_u.compressed_blocks); + } else { + errln("unsupported on-disk inode version %u of nid %llu", + __inode_version(advise), vi->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } + + if (!nblks) + /* measure inode.i_blocks as generic filesystems */ + inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; + else + inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK; + return 0; + +bogusimode: + errln("bogus i_mode (%o) @ nid %llu", inode->i_mode, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; +} + +/* + * try_lock can be required since locking order is: + * file data(fs_inode) + * meta(bd_inode) + * but the majority of the callers is "iget", + * in that case we are pretty sure no deadlock since + * no 
data operations exist. However I tend to + * try_lock since it takes no much overhead and + * will success immediately. + */ +static int fill_inline_data(struct inode *inode, void *data, + unsigned int m_pofs) +{ + struct erofs_vnode *vi = EROFS_V(inode); + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + + /* should be inode inline C */ + if (!is_inode_flat_inline(inode)) + return 0; + + /* fast symlink (following ext4) */ + if (S_ISLNK(inode->i_mode) && inode->i_size < PAGE_SIZE) { + char *lnk = erofs_kmalloc(sbi, inode->i_size + 1, GFP_KERNEL); + + if (unlikely(!lnk)) + return -ENOMEM; + + m_pofs += vi->inode_isize + vi->xattr_isize; + + /* inline symlink data shouldn't across page boundary as well */ + if (unlikely(m_pofs + inode->i_size > PAGE_SIZE)) { + kfree(lnk); + errln("inline data cross block boundary @ nid %llu", + vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + + /* get in-page inline data */ + memcpy(lnk, data + m_pofs, inode->i_size); + lnk[inode->i_size] = '\0'; + + inode->i_link = lnk; + set_inode_fast_symlink(inode); + } + return 0; +} + +static int fill_inode(struct inode *inode, int isdir) +{ + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); + struct erofs_vnode *vi = EROFS_V(inode); + struct page *page; + void *data; + int err; + erofs_blk_t blkaddr; + unsigned int ofs; + erofs_off_t inode_loc; + + trace_erofs_fill_inode(inode, isdir); + inode_loc = iloc(sbi, vi->nid); + blkaddr = erofs_blknr(inode_loc); + ofs = erofs_blkoff(inode_loc); + + debugln("%s, reading inode nid %llu at %u of blkaddr %u", + __func__, vi->nid, ofs, blkaddr); + + page = erofs_get_meta_page(inode->i_sb, blkaddr, isdir); + + if (IS_ERR(page)) { + errln("failed to get inode (nid: %llu) page, err %ld", + vi->nid, PTR_ERR(page)); + return PTR_ERR(page); + } + + DBG_BUGON(!PageUptodate(page)); + data = page_address(page); + + err = read_inode(inode, data + ofs); + if (!err) { + /* setup the new inode */ + if (S_ISREG(inode->i_mode)) { + inode->i_op = &erofs_generic_iops; + inode->i_fop = &generic_ro_fops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &erofs_dir_iops; + inode->i_fop = &erofs_dir_fops; + } else if (S_ISLNK(inode->i_mode)) { + /* by default, page_get_link is used for symlink */ + inode->i_op = &erofs_symlink_iops; + inode_nohighmem(inode); + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + inode->i_op = &erofs_generic_iops; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + goto out_unlock; + } else { + err = -EFSCORRUPTED; + goto out_unlock; + } + + if (is_inode_layout_compression(inode)) { + err = z_erofs_fill_inode(inode); + goto out_unlock; + } + + inode->i_mapping->a_ops = &erofs_raw_access_aops; + + /* fill last page if inline data is available */ + err = fill_inline_data(inode, data, ofs); + } + +out_unlock: + unlock_page(page); + put_page(page); + return err; +} + +/* + * erofs nid is 64bits, but i_ino is 'unsigned long', therefore + * we should do more for 32-bit platform to find the right inode. 
+ */ +#if BITS_PER_LONG == 32 +static int erofs_ilookup_test_actor(struct inode *inode, void *opaque) +{ + const erofs_nid_t nid = *(erofs_nid_t *)opaque; + + return EROFS_V(inode)->nid == nid; +} + +static int erofs_iget_set_actor(struct inode *inode, void *opaque) +{ + const erofs_nid_t nid = *(erofs_nid_t *)opaque; + + inode->i_ino = erofs_inode_hash(nid); + return 0; +} +#endif + +static inline struct inode *erofs_iget_locked(struct super_block *sb, + erofs_nid_t nid) +{ + const unsigned long hashval = erofs_inode_hash(nid); + +#if BITS_PER_LONG >= 64 + /* it is safe to use iget_locked for >= 64-bit platform */ + return iget_locked(sb, hashval); +#else + return iget5_locked(sb, hashval, erofs_ilookup_test_actor, + erofs_iget_set_actor, &nid); +#endif +} + +struct inode *erofs_iget(struct super_block *sb, + erofs_nid_t nid, + bool isdir) +{ + struct inode *inode = erofs_iget_locked(sb, nid); + + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + int err; + struct erofs_vnode *vi = EROFS_V(inode); + + vi->nid = nid; + + err = fill_inode(inode, isdir); + if (likely(!err)) + unlock_new_inode(inode); + else { + iget_failed(inode); + inode = ERR_PTR(err); + } + } + return inode; +} + +int erofs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *const inode = d_inode(path->dentry); + + if (is_inode_layout_compression(inode)) + stat->attributes |= STATX_ATTR_COMPRESSED; + + stat->attributes |= STATX_ATTR_IMMUTABLE; + stat->attributes_mask |= (STATX_ATTR_COMPRESSED | + STATX_ATTR_IMMUTABLE); + + generic_fillattr(inode, stat); + return 0; +} + +const struct inode_operations erofs_generic_iops = { + .getattr = erofs_getattr, +#ifdef CONFIG_EROFS_FS_XATTR + .listxattr = erofs_listxattr, +#endif + .get_acl = erofs_get_acl, +}; + +const struct inode_operations erofs_symlink_iops = { + .get_link = page_get_link, + .getattr = erofs_getattr, +#ifdef CONFIG_EROFS_FS_XATTR + .listxattr = erofs_listxattr, +#endif + .get_acl = erofs_get_acl, +}; + +const struct inode_operations erofs_fast_symlink_iops = { + .get_link = simple_get_link, + .getattr = erofs_getattr, +#ifdef CONFIG_EROFS_FS_XATTR + .listxattr = erofs_listxattr, +#endif + .get_acl = erofs_get_acl, +}; + diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h new file mode 100644 index 000000000000..620b73fcc416 --- /dev/null +++ b/fs/erofs/internal.h @@ -0,0 +1,553 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_INTERNAL_H +#define __EROFS_INTERNAL_H + +#include <linux/fs.h> +#include <linux/dcache.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/magic.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include "erofs_fs.h" + +/* redefine pr_fmt "erofs: " */ +#undef pr_fmt +#define pr_fmt(fmt) "erofs: " fmt + +#define errln(x, ...) pr_err(x "\n", ##__VA_ARGS__) +#define infoln(x, ...) pr_info(x "\n", ##__VA_ARGS__) +#ifdef CONFIG_EROFS_FS_DEBUG +#define debugln(x, ...) pr_debug(x "\n", ##__VA_ARGS__) +#define DBG_BUGON BUG_ON +#else +#define debugln(x, ...) 
((void)0) +#define DBG_BUGON(x) ((void)(x)) +#endif /* !CONFIG_EROFS_FS_DEBUG */ + +enum { + FAULT_KMALLOC, + FAULT_READ_IO, + FAULT_MAX, +}; + +#ifdef CONFIG_EROFS_FAULT_INJECTION +extern const char *erofs_fault_name[FAULT_MAX]; +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) + +struct erofs_fault_info { + atomic_t inject_ops; + unsigned int inject_rate; + unsigned int inject_type; +}; +#endif /* CONFIG_EROFS_FAULT_INJECTION */ + +/* EROFS_SUPER_MAGIC_V1 to represent the whole file system */ +#define EROFS_SUPER_MAGIC EROFS_SUPER_MAGIC_V1 + +typedef u64 erofs_nid_t; +typedef u64 erofs_off_t; +/* data type for filesystem-wide blocks number */ +typedef u32 erofs_blk_t; + +struct erofs_sb_info { +#ifdef CONFIG_EROFS_FS_ZIP + /* list for all registered superblocks, mainly for shrinker */ + struct list_head list; + struct mutex umount_mutex; + + /* the dedicated workstation for compression */ + struct radix_tree_root workstn_tree; + + /* threshold for decompression synchronously */ + unsigned int max_sync_decompress_pages; + + unsigned int shrinker_run_no; + + /* current strategy of how to use managed cache */ + unsigned char cache_strategy; + + /* pseudo inode to manage cached pages */ + struct inode *managed_cache; +#endif /* CONFIG_EROFS_FS_ZIP */ + u32 blocks; + u32 meta_blkaddr; +#ifdef CONFIG_EROFS_FS_XATTR + u32 xattr_blkaddr; +#endif + + /* inode slot unit size in bit shift */ + unsigned char islotbits; + + u32 build_time_nsec; + u64 build_time; + + /* what we really care is nid, rather than ino.. */ + erofs_nid_t root_nid; + /* used for statfs, f_files - f_favail */ + u64 inos; + + u8 uuid[16]; /* 128-bit uuid for volume */ + u8 volume_name[16]; /* volume name */ + u32 requirements; + + unsigned int mount_opt; + +#ifdef CONFIG_EROFS_FAULT_INJECTION + struct erofs_fault_info fault_info; /* For fault injection */ +#endif +}; + +#ifdef CONFIG_EROFS_FAULT_INJECTION +#define erofs_show_injection_info(type) \ + infoln("inject %s in %s of %pS", erofs_fault_name[type], \ + __func__, __builtin_return_address(0)) + +static inline bool time_to_inject(struct erofs_sb_info *sbi, int type) +{ + struct erofs_fault_info *ffi = &sbi->fault_info; + + if (!ffi->inject_rate) + return false; + + if (!IS_FAULT_SET(ffi, type)) + return false; + + atomic_inc(&ffi->inject_ops); + if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { + atomic_set(&ffi->inject_ops, 0); + return true; + } + return false; +} +#else +static inline bool time_to_inject(struct erofs_sb_info *sbi, int type) +{ + return false; +} + +static inline void erofs_show_injection_info(int type) +{ +} +#endif /* !CONFIG_EROFS_FAULT_INJECTION */ + +static inline void *erofs_kmalloc(struct erofs_sb_info *sbi, + size_t size, gfp_t flags) +{ + if (time_to_inject(sbi, FAULT_KMALLOC)) { + erofs_show_injection_info(FAULT_KMALLOC); + return NULL; + } + return kmalloc(size, flags); +} + +#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) +#define EROFS_I_SB(inode) ((struct erofs_sb_info *)(inode)->i_sb->s_fs_info) + +/* Mount flags set via mount options or defaults */ +#define EROFS_MOUNT_XATTR_USER 0x00000010 +#define EROFS_MOUNT_POSIX_ACL 0x00000020 +#define EROFS_MOUNT_FAULT_INJECTION 0x00000040 + +#define clear_opt(sbi, option) ((sbi)->mount_opt &= ~EROFS_MOUNT_##option) +#define set_opt(sbi, option) ((sbi)->mount_opt |= EROFS_MOUNT_##option) +#define test_opt(sbi, option) ((sbi)->mount_opt & EROFS_MOUNT_##option) + +#ifdef CONFIG_EROFS_FS_ZIP +enum { + EROFS_ZIP_CACHE_DISABLED, + EROFS_ZIP_CACHE_READAHEAD, + 
EROFS_ZIP_CACHE_READAROUND +}; + +#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL) + +/* basic unit of the workstation of a super_block */ +struct erofs_workgroup { + /* the workgroup index in the workstation */ + pgoff_t index; + + /* overall workgroup reference count */ + atomic_t refcount; +}; + +#if defined(CONFIG_SMP) +static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp, + int val) +{ + preempt_disable(); + if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) { + preempt_enable(); + return false; + } + return true; +} + +static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp, + int orig_val) +{ + /* + * other observers should notice all modifications + * in the freezing period. + */ + smp_mb(); + atomic_set(&grp->refcount, orig_val); + preempt_enable(); +} + +static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) +{ + return atomic_cond_read_relaxed(&grp->refcount, + VAL != EROFS_LOCKED_MAGIC); +} +#else +static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp, + int val) +{ + preempt_disable(); + /* no need to spin on UP platforms, let's just disable preemption. */ + if (val != atomic_read(&grp->refcount)) { + preempt_enable(); + return false; + } + return true; +} + +static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp, + int orig_val) +{ + preempt_enable(); +} + +static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) +{ + int v = atomic_read(&grp->refcount); + + /* workgroup is never freezed on uniprocessor systems */ + DBG_BUGON(v == EROFS_LOCKED_MAGIC); + return v; +} +#endif /* !CONFIG_SMP */ + +/* hard limit of pages per compressed cluster */ +#define Z_EROFS_CLUSTER_MAX_PAGES (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT) +#define EROFS_PCPUBUF_NR_PAGES Z_EROFS_CLUSTER_MAX_PAGES +#else +#define EROFS_PCPUBUF_NR_PAGES 0 +#endif /* !CONFIG_EROFS_FS_ZIP */ + +/* we strictly follow PAGE_SIZE and no buffer head yet */ +#define LOG_BLOCK_SIZE PAGE_SHIFT + +#undef LOG_SECTORS_PER_BLOCK +#define LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9) + +#undef SECTORS_PER_BLOCK +#define SECTORS_PER_BLOCK (1 << SECTORS_PER_BLOCK) + +#define EROFS_BLKSIZ (1 << LOG_BLOCK_SIZE) + +#if (EROFS_BLKSIZ % 4096 || !EROFS_BLKSIZ) +#error erofs cannot be used in this platform +#endif + +#define EROFS_IO_MAX_RETRIES_NOFAIL 5 + +#define ROOT_NID(sb) ((sb)->root_nid) + +#define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ) +#define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ) +#define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ) + +static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid) +{ + return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits); +} + +/* atomic flag definitions */ +#define EROFS_V_EA_INITED_BIT 0 +#define EROFS_V_Z_INITED_BIT 1 + +/* bitlock definitions (arranged in reverse order) */ +#define EROFS_V_BL_XATTR_BIT (BITS_PER_LONG - 1) +#define EROFS_V_BL_Z_BIT (BITS_PER_LONG - 2) + +struct erofs_vnode { + erofs_nid_t nid; + + /* atomic flags (including bitlocks) */ + unsigned long flags; + + unsigned char datamode; + unsigned char inode_isize; + unsigned short xattr_isize; + + unsigned int xattr_shared_count; + unsigned int *xattr_shared_xattrs; + + union { + erofs_blk_t raw_blkaddr; +#ifdef CONFIG_EROFS_FS_ZIP + struct { + unsigned short z_advise; + unsigned char z_algorithmtype[2]; + unsigned char z_logical_clusterbits; + unsigned char z_physical_clusterbits[2]; + }; +#endif /* CONFIG_EROFS_FS_ZIP */ + }; + /* the corresponding vfs 
inode */ + struct inode vfs_inode; +}; + +#define EROFS_V(ptr) \ + container_of(ptr, struct erofs_vnode, vfs_inode) + +#define __inode_advise(x, bit, bits) \ + (((x) >> (bit)) & ((1 << (bits)) - 1)) + +#define __inode_version(advise) \ + __inode_advise(advise, EROFS_I_VERSION_BIT, \ + EROFS_I_VERSION_BITS) + +#define __inode_data_mapping(advise) \ + __inode_advise(advise, EROFS_I_DATA_MAPPING_BIT,\ + EROFS_I_DATA_MAPPING_BITS) + +static inline unsigned long inode_datablocks(struct inode *inode) +{ + /* since i_size cannot be changed */ + return DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); +} + +static inline bool is_inode_layout_compression(struct inode *inode) +{ + return erofs_inode_is_data_compressed(EROFS_V(inode)->datamode); +} + +static inline bool is_inode_flat_inline(struct inode *inode) +{ + return EROFS_V(inode)->datamode == EROFS_INODE_FLAT_INLINE; +} + +extern const struct super_operations erofs_sops; + +extern const struct address_space_operations erofs_raw_access_aops; +#ifdef CONFIG_EROFS_FS_ZIP +extern const struct address_space_operations z_erofs_vle_normalaccess_aops; +#endif + +/* + * Logical to physical block mapping, used by erofs_map_blocks() + * + * Different with other file systems, it is used for 2 access modes: + * + * 1) RAW access mode: + * + * Users pass a valid (m_lblk, m_lofs -- usually 0) pair, + * and get the valid m_pblk, m_pofs and the longest m_len(in bytes). + * + * Note that m_lblk in the RAW access mode refers to the number of + * the compressed ondisk block rather than the uncompressed + * in-memory block for the compressed file. + * + * m_pofs equals to m_lofs except for the inline data page. + * + * 2) Normal access mode: + * + * If the inode is not compressed, it has no difference with + * the RAW access mode. However, if the inode is compressed, + * users should pass a valid (m_lblk, m_lofs) pair, and get + * the needed m_pblk, m_pofs, m_len to get the compressed data + * and the updated m_lblk, m_lofs which indicates the start + * of the corresponding uncompressed data in the file. + */ +enum { + BH_Zipped = BH_PrivateStart, + BH_FullMapped, +}; + +/* Has a disk mapping */ +#define EROFS_MAP_MAPPED (1 << BH_Mapped) +/* Located in metadata (could be copied from bd_inode) */ +#define EROFS_MAP_META (1 << BH_Meta) +/* The extent has been compressed */ +#define EROFS_MAP_ZIPPED (1 << BH_Zipped) +/* The length of extent is full */ +#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) + +struct erofs_map_blocks { + erofs_off_t m_pa, m_la; + u64 m_plen, m_llen; + + unsigned int m_flags; + + struct page *mpage; +}; + +/* Flags used by erofs_map_blocks() */ +#define EROFS_GET_BLOCKS_RAW 0x0001 + +/* zmap.c */ +#ifdef CONFIG_EROFS_FS_ZIP +int z_erofs_fill_inode(struct inode *inode); +int z_erofs_map_blocks_iter(struct inode *inode, + struct erofs_map_blocks *map, + int flags); +#else +static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; } +static inline int z_erofs_map_blocks_iter(struct inode *inode, + struct erofs_map_blocks *map, + int flags) +{ + return -EOPNOTSUPP; +} +#endif /* !CONFIG_EROFS_FS_ZIP */ + +/* data.c */ +static inline struct bio *erofs_grab_bio(struct super_block *sb, + erofs_blk_t blkaddr, + unsigned int nr_pages, + void *bi_private, bio_end_io_t endio, + bool nofail) +{ + const gfp_t gfp = GFP_NOIO; + struct bio *bio; + + do { + if (nr_pages == 1) { + bio = bio_alloc(gfp | (nofail ? 
__GFP_NOFAIL : 0), 1); + if (unlikely(!bio)) { + DBG_BUGON(nofail); + return ERR_PTR(-ENOMEM); + } + break; + } + bio = bio_alloc(gfp, nr_pages); + nr_pages /= 2; + } while (unlikely(!bio)); + + bio->bi_end_io = endio; + bio_set_dev(bio, sb->s_bdev); + bio->bi_iter.bi_sector = (sector_t)blkaddr << LOG_SECTORS_PER_BLOCK; + bio->bi_private = bi_private; + return bio; +} + +static inline void __submit_bio(struct bio *bio, unsigned int op, + unsigned int op_flags) +{ + bio_set_op_attrs(bio, op, op_flags); + submit_bio(bio); +} + +struct page *__erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr, + bool prio, bool nofail); + +static inline struct page *erofs_get_meta_page(struct super_block *sb, + erofs_blk_t blkaddr, bool prio) +{ + return __erofs_get_meta_page(sb, blkaddr, prio, false); +} + +int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int); + +static inline struct page *erofs_get_inline_page(struct inode *inode, + erofs_blk_t blkaddr) +{ + return erofs_get_meta_page(inode->i_sb, blkaddr, + S_ISDIR(inode->i_mode)); +} + +/* inode.c */ +static inline unsigned long erofs_inode_hash(erofs_nid_t nid) +{ +#if BITS_PER_LONG == 32 + return (nid >> 32) ^ (nid & 0xffffffff); +#else + return nid; +#endif +} + +extern const struct inode_operations erofs_generic_iops; +extern const struct inode_operations erofs_symlink_iops; +extern const struct inode_operations erofs_fast_symlink_iops; + +static inline void set_inode_fast_symlink(struct inode *inode) +{ + inode->i_op = &erofs_fast_symlink_iops; +} + +static inline bool is_inode_fast_symlink(struct inode *inode) +{ + return inode->i_op == &erofs_fast_symlink_iops; +} + +struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir); +int erofs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags); + +/* namei.c */ +extern const struct inode_operations erofs_dir_iops; + +int erofs_namei(struct inode *dir, struct qstr *name, + erofs_nid_t *nid, unsigned int *d_type); + +/* dir.c */ +extern const struct file_operations erofs_dir_fops; + +/* utils.c / zdata.c */ +struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail); + +#if (EROFS_PCPUBUF_NR_PAGES > 0) +void *erofs_get_pcpubuf(unsigned int pagenr); +#define erofs_put_pcpubuf(buf) do { \ + (void)&(buf); \ + preempt_enable(); \ +} while (0) +#else +static inline void *erofs_get_pcpubuf(unsigned int pagenr) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +#define erofs_put_pcpubuf(buf) do {} while (0) +#endif + +#ifdef CONFIG_EROFS_FS_ZIP +int erofs_workgroup_put(struct erofs_workgroup *grp); +struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, + pgoff_t index, bool *tag); +int erofs_register_workgroup(struct super_block *sb, + struct erofs_workgroup *grp, bool tag); +void erofs_workgroup_free_rcu(struct erofs_workgroup *grp); +void erofs_shrinker_register(struct super_block *sb); +void erofs_shrinker_unregister(struct super_block *sb); +int __init erofs_init_shrinker(void); +void erofs_exit_shrinker(void); +int __init z_erofs_init_zip_subsystem(void); +void z_erofs_exit_zip_subsystem(void); +int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, + struct erofs_workgroup *egrp); +int erofs_try_to_free_cached_page(struct address_space *mapping, + struct page *page); +#else +static inline void erofs_shrinker_register(struct super_block *sb) {} +static inline void erofs_shrinker_unregister(struct super_block *sb) {} +static inline int erofs_init_shrinker(void) { return 0; } +static 
inline void erofs_exit_shrinker(void) {} +static inline int z_erofs_init_zip_subsystem(void) { return 0; } +static inline void z_erofs_exit_zip_subsystem(void) {} +#endif /* !CONFIG_EROFS_FS_ZIP */ + +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + +#endif /* __EROFS_INTERNAL_H */ + diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c new file mode 100644 index 000000000000..8832b5d95d91 --- /dev/null +++ b/fs/erofs/namei.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "xattr.h" + +#include <trace/events/erofs.h> + +struct erofs_qstr { + const unsigned char *name; + const unsigned char *end; +}; + +/* based on the end of qn is accurate and it must have the trailing '\0' */ +static inline int dirnamecmp(const struct erofs_qstr *qn, + const struct erofs_qstr *qd, + unsigned int *matched) +{ + unsigned int i = *matched; + + /* + * on-disk error, let's only BUG_ON in the debugging mode. + * otherwise, it will return 1 to just skip the invalid name + * and go on (in consideration of the lookup performance). + */ + DBG_BUGON(qd->name > qd->end); + + /* qd could not have trailing '\0' */ + /* However it is absolutely safe if < qd->end */ + while (qd->name + i < qd->end && qd->name[i] != '\0') { + if (qn->name[i] != qd->name[i]) { + *matched = i; + return qn->name[i] > qd->name[i] ? 1 : -1; + } + ++i; + } + *matched = i; + /* See comments in __d_alloc on the terminating NUL character */ + return qn->name[i] == '\0' ? 0 : 1; +} + +#define nameoff_from_disk(off, sz) (le16_to_cpu(off) & ((sz) - 1)) + +static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name, + u8 *data, + unsigned int dirblksize, + const int ndirents) +{ + int head, back; + unsigned int startprfx, endprfx; + struct erofs_dirent *const de = (struct erofs_dirent *)data; + + /* since the 1st dirent has been evaluated previously */ + head = 1; + back = ndirents - 1; + startprfx = endprfx = 0; + + while (head <= back) { + const int mid = head + (back - head) / 2; + const int nameoff = nameoff_from_disk(de[mid].nameoff, + dirblksize); + unsigned int matched = min(startprfx, endprfx); + struct erofs_qstr dname = { + .name = data + nameoff, + .end = unlikely(mid >= ndirents - 1) ? 
+ data + dirblksize : + data + nameoff_from_disk(de[mid + 1].nameoff, + dirblksize) + }; + + /* string comparison without already matched prefix */ + int ret = dirnamecmp(name, &dname, &matched); + + if (unlikely(!ret)) { + return de + mid; + } else if (ret > 0) { + head = mid + 1; + startprfx = matched; + } else { + back = mid - 1; + endprfx = matched; + } + } + + return ERR_PTR(-ENOENT); +} + +static struct page *find_target_block_classic(struct inode *dir, + struct erofs_qstr *name, + int *_ndirents) +{ + unsigned int startprfx, endprfx; + int head, back; + struct address_space *const mapping = dir->i_mapping; + struct page *candidate = ERR_PTR(-ENOENT); + + startprfx = endprfx = 0; + head = 0; + back = inode_datablocks(dir) - 1; + + while (head <= back) { + const int mid = head + (back - head) / 2; + struct page *page = read_mapping_page(mapping, mid, NULL); + + if (!IS_ERR(page)) { + struct erofs_dirent *de = kmap_atomic(page); + const int nameoff = nameoff_from_disk(de->nameoff, + EROFS_BLKSIZ); + const int ndirents = nameoff / sizeof(*de); + int diff; + unsigned int matched; + struct erofs_qstr dname; + + if (unlikely(!ndirents)) { + kunmap_atomic(de); + put_page(page); + errln("corrupted dir block %d @ nid %llu", + mid, EROFS_V(dir)->nid); + DBG_BUGON(1); + page = ERR_PTR(-EFSCORRUPTED); + goto out; + } + + matched = min(startprfx, endprfx); + + dname.name = (u8 *)de + nameoff; + if (ndirents == 1) + dname.end = (u8 *)de + EROFS_BLKSIZ; + else + dname.end = (u8 *)de + + nameoff_from_disk(de[1].nameoff, + EROFS_BLKSIZ); + + /* string comparison without already matched prefix */ + diff = dirnamecmp(name, &dname, &matched); + kunmap_atomic(de); + + if (unlikely(!diff)) { + *_ndirents = 0; + goto out; + } else if (diff > 0) { + head = mid + 1; + startprfx = matched; + + if (!IS_ERR(candidate)) + put_page(candidate); + candidate = page; + *_ndirents = ndirents; + } else { + put_page(page); + + back = mid - 1; + endprfx = matched; + } + continue; + } +out: /* free if the candidate is valid */ + if (!IS_ERR(candidate)) + put_page(candidate); + return page; + } + return candidate; +} + +int erofs_namei(struct inode *dir, + struct qstr *name, + erofs_nid_t *nid, unsigned int *d_type) +{ + int ndirents; + struct page *page; + void *data; + struct erofs_dirent *de; + struct erofs_qstr qn; + + if (unlikely(!dir->i_size)) + return -ENOENT; + + qn.name = name->name; + qn.end = name->name + name->len; + + ndirents = 0; + page = find_target_block_classic(dir, &qn, &ndirents); + + if (IS_ERR(page)) + return PTR_ERR(page); + + data = kmap_atomic(page); + /* the target page has been mapped */ + if (ndirents) + de = find_target_dirent(&qn, data, EROFS_BLKSIZ, ndirents); + else + de = (struct erofs_dirent *)data; + + if (!IS_ERR(de)) { + *nid = le64_to_cpu(de->nid); + *d_type = de->file_type; + } + + kunmap_atomic(data); + put_page(page); + + return PTR_ERR_OR_ZERO(de); +} + +/* NOTE: i_mutex is already held by vfs */ +static struct dentry *erofs_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + int err; + erofs_nid_t nid; + unsigned int d_type; + struct inode *inode; + + DBG_BUGON(!d_really_is_negative(dentry)); + /* dentry must be unhashed in lookup, no need to worry about */ + DBG_BUGON(!d_unhashed(dentry)); + + trace_erofs_lookup(dir, dentry, flags); + + /* file name exceeds fs limit */ + if (unlikely(dentry->d_name.len > EROFS_NAME_LEN)) + return ERR_PTR(-ENAMETOOLONG); + + /* false uninitialized warnings on gcc 4.8.x */ + err = erofs_namei(dir, &dentry->d_name, &nid, 
&d_type); + + if (err == -ENOENT) { + /* negative dentry */ + inode = NULL; + } else if (unlikely(err)) { + inode = ERR_PTR(err); + } else { + debugln("%s, %s (nid %llu) found, d_type %u", __func__, + dentry->d_name.name, nid, d_type); + inode = erofs_iget(dir->i_sb, nid, d_type == FT_DIR); + } + return d_splice_alias(inode, dentry); +} + +const struct inode_operations erofs_dir_iops = { + .lookup = erofs_lookup, + .getattr = erofs_getattr, +#ifdef CONFIG_EROFS_FS_XATTR + .listxattr = erofs_listxattr, +#endif + .get_acl = erofs_get_acl, +}; + diff --git a/fs/erofs/super.c b/fs/erofs/super.c new file mode 100644 index 000000000000..6d3a9bcb8daa --- /dev/null +++ b/fs/erofs/super.c @@ -0,0 +1,669 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include <linux/module.h> +#include <linux/buffer_head.h> +#include <linux/statfs.h> +#include <linux/parser.h> +#include <linux/seq_file.h> +#include "xattr.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/erofs.h> + +static struct kmem_cache *erofs_inode_cachep __read_mostly; + +static void init_once(void *ptr) +{ + struct erofs_vnode *vi = ptr; + + inode_init_once(&vi->vfs_inode); +} + +static int __init erofs_init_inode_cache(void) +{ + erofs_inode_cachep = kmem_cache_create("erofs_inode", + sizeof(struct erofs_vnode), 0, + SLAB_RECLAIM_ACCOUNT, + init_once); + + return erofs_inode_cachep ? 0 : -ENOMEM; +} + +static void erofs_exit_inode_cache(void) +{ + kmem_cache_destroy(erofs_inode_cachep); +} + +static struct inode *alloc_inode(struct super_block *sb) +{ + struct erofs_vnode *vi = + kmem_cache_alloc(erofs_inode_cachep, GFP_KERNEL); + + if (!vi) + return NULL; + + /* zero out everything except vfs_inode */ + memset(vi, 0, offsetof(struct erofs_vnode, vfs_inode)); + return &vi->vfs_inode; +} + +static void free_inode(struct inode *inode) +{ + struct erofs_vnode *vi = EROFS_V(inode); + + /* be careful RCU symlink path (see ext4_inode_info->i_data)! 
*/ + if (is_inode_fast_symlink(inode)) + kfree(inode->i_link); + + kfree(vi->xattr_shared_xattrs); + + kmem_cache_free(erofs_inode_cachep, vi); +} + +static bool check_layout_compatibility(struct super_block *sb, + struct erofs_super_block *layout) +{ + const unsigned int requirements = le32_to_cpu(layout->requirements); + + EROFS_SB(sb)->requirements = requirements; + + /* check if current kernel meets all mandatory requirements */ + if (requirements & (~EROFS_ALL_REQUIREMENTS)) { + errln("unidentified requirements %x, please upgrade kernel version", + requirements & ~EROFS_ALL_REQUIREMENTS); + return false; + } + return true; +} + +static int superblock_read(struct super_block *sb) +{ + struct erofs_sb_info *sbi; + struct buffer_head *bh; + struct erofs_super_block *layout; + unsigned int blkszbits; + int ret; + + bh = sb_bread(sb, 0); + + if (!bh) { + errln("cannot read erofs superblock"); + return -EIO; + } + + sbi = EROFS_SB(sb); + layout = (struct erofs_super_block *)((u8 *)bh->b_data + + EROFS_SUPER_OFFSET); + + ret = -EINVAL; + if (le32_to_cpu(layout->magic) != EROFS_SUPER_MAGIC_V1) { + errln("cannot find valid erofs superblock"); + goto out; + } + + blkszbits = layout->blkszbits; + /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ + if (unlikely(blkszbits != LOG_BLOCK_SIZE)) { + errln("blksize %u isn't supported on this platform", + 1 << blkszbits); + goto out; + } + + if (!check_layout_compatibility(sb, layout)) + goto out; + + sbi->blocks = le32_to_cpu(layout->blocks); + sbi->meta_blkaddr = le32_to_cpu(layout->meta_blkaddr); +#ifdef CONFIG_EROFS_FS_XATTR + sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr); +#endif + sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1; + sbi->root_nid = le16_to_cpu(layout->root_nid); + sbi->inos = le64_to_cpu(layout->inos); + + sbi->build_time = le64_to_cpu(layout->build_time); + sbi->build_time_nsec = le32_to_cpu(layout->build_time_nsec); + + memcpy(&sb->s_uuid, layout->uuid, sizeof(layout->uuid)); + + ret = strscpy(sbi->volume_name, layout->volume_name, + sizeof(layout->volume_name)); + if (ret < 0) { /* -E2BIG */ + errln("bad volume name without NIL terminator"); + ret = -EFSCORRUPTED; + goto out; + } + ret = 0; +out: + brelse(bh); + return ret; +} + +#ifdef CONFIG_EROFS_FAULT_INJECTION +const char *erofs_fault_name[FAULT_MAX] = { + [FAULT_KMALLOC] = "kmalloc", + [FAULT_READ_IO] = "read IO error", +}; + +static void __erofs_build_fault_attr(struct erofs_sb_info *sbi, + unsigned int rate) +{ + struct erofs_fault_info *ffi = &sbi->fault_info; + + if (rate) { + atomic_set(&ffi->inject_ops, 0); + ffi->inject_rate = rate; + ffi->inject_type = (1 << FAULT_MAX) - 1; + } else { + memset(ffi, 0, sizeof(struct erofs_fault_info)); + } + + set_opt(sbi, FAULT_INJECTION); +} + +static int erofs_build_fault_attr(struct erofs_sb_info *sbi, + substring_t *args) +{ + int rate = 0; + + if (args->from && match_int(args, &rate)) + return -EINVAL; + + __erofs_build_fault_attr(sbi, rate); + return 0; +} + +static unsigned int erofs_get_fault_rate(struct erofs_sb_info *sbi) +{ + return sbi->fault_info.inject_rate; +} +#else +static void __erofs_build_fault_attr(struct erofs_sb_info *sbi, + unsigned int rate) +{ +} + +static int erofs_build_fault_attr(struct erofs_sb_info *sbi, + substring_t *args) +{ + infoln("fault_injection options not supported"); + return 0; +} + +static unsigned int erofs_get_fault_rate(struct erofs_sb_info *sbi) +{ + return 0; +} +#endif + +#ifdef CONFIG_EROFS_FS_ZIP +static int erofs_build_cache_strategy(struct 
erofs_sb_info *sbi, + substring_t *args) +{ + const char *cs = match_strdup(args); + int err = 0; + + if (!cs) { + errln("Not enough memory to store cache strategy"); + return -ENOMEM; + } + + if (!strcmp(cs, "disabled")) { + sbi->cache_strategy = EROFS_ZIP_CACHE_DISABLED; + } else if (!strcmp(cs, "readahead")) { + sbi->cache_strategy = EROFS_ZIP_CACHE_READAHEAD; + } else if (!strcmp(cs, "readaround")) { + sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND; + } else { + errln("Unrecognized cache strategy \"%s\"", cs); + err = -EINVAL; + } + kfree(cs); + return err; +} +#else +static int erofs_build_cache_strategy(struct erofs_sb_info *sbi, + substring_t *args) +{ + infoln("EROFS compression is disabled, so cache strategy is ignored"); + return 0; +} +#endif + +/* set up default EROFS parameters */ +static void default_options(struct erofs_sb_info *sbi) +{ +#ifdef CONFIG_EROFS_FS_ZIP + sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND; + sbi->max_sync_decompress_pages = 3; +#endif +#ifdef CONFIG_EROFS_FS_XATTR + set_opt(sbi, XATTR_USER); +#endif +#ifdef CONFIG_EROFS_FS_POSIX_ACL + set_opt(sbi, POSIX_ACL); +#endif +} + +enum { + Opt_user_xattr, + Opt_nouser_xattr, + Opt_acl, + Opt_noacl, + Opt_fault_injection, + Opt_cache_strategy, + Opt_err +}; + +static match_table_t erofs_tokens = { + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_fault_injection, "fault_injection=%u"}, + {Opt_cache_strategy, "cache_strategy=%s"}, + {Opt_err, NULL} +}; + +static int parse_options(struct super_block *sb, char *options) +{ + substring_t args[MAX_OPT_ARGS]; + char *p; + int err; + + if (!options) + return 0; + + while ((p = strsep(&options, ","))) { + int token; + + if (!*p) + continue; + + args[0].to = args[0].from = NULL; + token = match_token(p, erofs_tokens, args); + + switch (token) { +#ifdef CONFIG_EROFS_FS_XATTR + case Opt_user_xattr: + set_opt(EROFS_SB(sb), XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt(EROFS_SB(sb), XATTR_USER); + break; +#else + case Opt_user_xattr: + infoln("user_xattr options not supported"); + break; + case Opt_nouser_xattr: + infoln("nouser_xattr options not supported"); + break; +#endif +#ifdef CONFIG_EROFS_FS_POSIX_ACL + case Opt_acl: + set_opt(EROFS_SB(sb), POSIX_ACL); + break; + case Opt_noacl: + clear_opt(EROFS_SB(sb), POSIX_ACL); + break; +#else + case Opt_acl: + infoln("acl options not supported"); + break; + case Opt_noacl: + infoln("noacl options not supported"); + break; +#endif + case Opt_fault_injection: + err = erofs_build_fault_attr(EROFS_SB(sb), args); + if (err) + return err; + break; + case Opt_cache_strategy: + err = erofs_build_cache_strategy(EROFS_SB(sb), args); + if (err) + return err; + break; + default: + errln("Unrecognized mount option \"%s\" or missing value", p); + return -EINVAL; + } + } + return 0; +} + +#ifdef CONFIG_EROFS_FS_ZIP +static const struct address_space_operations managed_cache_aops; + +static int managed_cache_releasepage(struct page *page, gfp_t gfp_mask) +{ + int ret = 1; /* 0 - busy */ + struct address_space *const mapping = page->mapping; + + DBG_BUGON(!PageLocked(page)); + DBG_BUGON(mapping->a_ops != &managed_cache_aops); + + if (PagePrivate(page)) + ret = erofs_try_to_free_cached_page(mapping, page); + + return ret; +} + +static void managed_cache_invalidatepage(struct page *page, + unsigned int offset, + unsigned int length) +{ + const unsigned int stop = length + offset; + + DBG_BUGON(!PageLocked(page)); + + /* Check for potential overflow in 
debug mode */ + DBG_BUGON(stop > PAGE_SIZE || stop < length); + + if (offset == 0 && stop == PAGE_SIZE) + while (!managed_cache_releasepage(page, GFP_NOFS)) + cond_resched(); +} + +static const struct address_space_operations managed_cache_aops = { + .releasepage = managed_cache_releasepage, + .invalidatepage = managed_cache_invalidatepage, +}; + +static int erofs_init_managed_cache(struct super_block *sb) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + struct inode *const inode = new_inode(sb); + + if (unlikely(!inode)) + return -ENOMEM; + + set_nlink(inode, 1); + inode->i_size = OFFSET_MAX; + + inode->i_mapping->a_ops = &managed_cache_aops; + mapping_set_gfp_mask(inode->i_mapping, + GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE); + sbi->managed_cache = inode; + return 0; +} +#else +static int erofs_init_managed_cache(struct super_block *sb) { return 0; } +#endif + +static int erofs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode; + struct erofs_sb_info *sbi; + int err; + + infoln("fill_super, device -> %s", sb->s_id); + infoln("options -> %s", (char *)data); + + sb->s_magic = EROFS_SUPER_MAGIC; + + if (unlikely(!sb_set_blocksize(sb, EROFS_BLKSIZ))) { + errln("failed to set erofs blksize"); + return -EINVAL; + } + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (unlikely(!sbi)) + return -ENOMEM; + + sb->s_fs_info = sbi; + err = superblock_read(sb); + if (err) + return err; + + sb->s_flags |= SB_RDONLY | SB_NOATIME; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_time_gran = 1; + + sb->s_op = &erofs_sops; + +#ifdef CONFIG_EROFS_FS_XATTR + sb->s_xattr = erofs_xattr_handlers; +#endif + /* set erofs default mount options */ + default_options(sbi); + + err = parse_options(sb, data); + if (unlikely(err)) + return err; + + if (!silent) + infoln("root inode @ nid %llu", ROOT_NID(sbi)); + + if (test_opt(sbi, POSIX_ACL)) + sb->s_flags |= SB_POSIXACL; + else + sb->s_flags &= ~SB_POSIXACL; + +#ifdef CONFIG_EROFS_FS_ZIP + INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC); +#endif + + /* get the root inode */ + inode = erofs_iget(sb, ROOT_NID(sbi), true); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (unlikely(!S_ISDIR(inode->i_mode))) { + errln("rootino(nid %llu) is not a directory(i_mode %o)", + ROOT_NID(sbi), inode->i_mode); + iput(inode); + return -EINVAL; + } + + sb->s_root = d_make_root(inode); + if (unlikely(!sb->s_root)) + return -ENOMEM; + + erofs_shrinker_register(sb); + /* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */ + err = erofs_init_managed_cache(sb); + if (unlikely(err)) + return err; + + if (!silent) + infoln("mounted on %s with opts: %s.", sb->s_id, (char *)data); + return 0; +} + +static struct dentry *erofs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, erofs_fill_super); +} + +/* + * could be triggered after deactivate_locked_super() + * is called, thus including umount and failed to initialize. 
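+ * In the failed-mount case ->s_fs_info may still be NULL (erofs_fill_super
+ * can bail out before sbi is allocated), hence the NULL check on sbi
+ * below before freeing it.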
+ */ +static void erofs_kill_sb(struct super_block *sb) +{ + struct erofs_sb_info *sbi; + + WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC); + infoln("unmounting for %s", sb->s_id); + + kill_block_super(sb); + + sbi = EROFS_SB(sb); + if (!sbi) + return; + kfree(sbi); + sb->s_fs_info = NULL; +} + +/* called when ->s_root is non-NULL */ +static void erofs_put_super(struct super_block *sb) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + + DBG_BUGON(!sbi); + + erofs_shrinker_unregister(sb); +#ifdef CONFIG_EROFS_FS_ZIP + iput(sbi->managed_cache); + sbi->managed_cache = NULL; +#endif +} + +static struct file_system_type erofs_fs_type = { + .owner = THIS_MODULE, + .name = "erofs", + .mount = erofs_mount, + .kill_sb = erofs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("erofs"); + +static int __init erofs_module_init(void) +{ + int err; + + erofs_check_ondisk_layout_definitions(); + infoln("initializing erofs " EROFS_VERSION); + + err = erofs_init_inode_cache(); + if (err) + goto icache_err; + + err = erofs_init_shrinker(); + if (err) + goto shrinker_err; + + err = z_erofs_init_zip_subsystem(); + if (err) + goto zip_err; + + err = register_filesystem(&erofs_fs_type); + if (err) + goto fs_err; + + infoln("successfully to initialize erofs"); + return 0; + +fs_err: + z_erofs_exit_zip_subsystem(); +zip_err: + erofs_exit_shrinker(); +shrinker_err: + erofs_exit_inode_cache(); +icache_err: + return err; +} + +static void __exit erofs_module_exit(void) +{ + unregister_filesystem(&erofs_fs_type); + z_erofs_exit_zip_subsystem(); + erofs_exit_shrinker(); + erofs_exit_inode_cache(); + infoln("successfully finalize erofs"); +} + +/* get filesystem statistics */ +static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct erofs_sb_info *sbi = EROFS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = sb->s_magic; + buf->f_bsize = EROFS_BLKSIZ; + buf->f_blocks = sbi->blocks; + buf->f_bfree = buf->f_bavail = 0; + + buf->f_files = ULLONG_MAX; + buf->f_ffree = ULLONG_MAX - sbi->inos; + + buf->f_namelen = EROFS_NAME_LEN; + + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + return 0; +} + +static int erofs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb); + +#ifdef CONFIG_EROFS_FS_XATTR + if (test_opt(sbi, XATTR_USER)) + seq_puts(seq, ",user_xattr"); + else + seq_puts(seq, ",nouser_xattr"); +#endif +#ifdef CONFIG_EROFS_FS_POSIX_ACL + if (test_opt(sbi, POSIX_ACL)) + seq_puts(seq, ",acl"); + else + seq_puts(seq, ",noacl"); +#endif + if (test_opt(sbi, FAULT_INJECTION)) + seq_printf(seq, ",fault_injection=%u", + erofs_get_fault_rate(sbi)); +#ifdef CONFIG_EROFS_FS_ZIP + if (sbi->cache_strategy == EROFS_ZIP_CACHE_DISABLED) { + seq_puts(seq, ",cache_strategy=disabled"); + } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) { + seq_puts(seq, ",cache_strategy=readahead"); + } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAROUND) { + seq_puts(seq, ",cache_strategy=readaround"); + } else { + seq_puts(seq, ",cache_strategy=(unknown)"); + DBG_BUGON(1); + } +#endif + return 0; +} + +static int erofs_remount(struct super_block *sb, int *flags, char *data) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + unsigned int org_mnt_opt = sbi->mount_opt; + unsigned int org_inject_rate = erofs_get_fault_rate(sbi); + int err; + + DBG_BUGON(!sb_rdonly(sb)); + err = parse_options(sb, data); + if (err) + goto out; + + if (test_opt(sbi, 
POSIX_ACL)) + sb->s_flags |= SB_POSIXACL; + else + sb->s_flags &= ~SB_POSIXACL; + + *flags |= SB_RDONLY; + return 0; +out: + __erofs_build_fault_attr(sbi, org_inject_rate); + sbi->mount_opt = org_mnt_opt; + + return err; +} + +const struct super_operations erofs_sops = { + .put_super = erofs_put_super, + .alloc_inode = alloc_inode, + .free_inode = free_inode, + .statfs = erofs_statfs, + .show_options = erofs_show_options, + .remount_fs = erofs_remount, +}; + +module_init(erofs_module_init); +module_exit(erofs_module_exit); + +MODULE_DESCRIPTION("Enhanced ROM File System"); +MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc."); +MODULE_LICENSE("GPL"); + diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h new file mode 100644 index 000000000000..a72897c86744 --- /dev/null +++ b/fs/erofs/tagptr.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * A tagged pointer implementation + * + * Copyright (C) 2018 Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_FS_TAGPTR_H +#define __EROFS_FS_TAGPTR_H + +#include <linux/types.h> +#include <linux/build_bug.h> + +/* + * the name of tagged pointer types are tagptr{1, 2, 3...}_t + * avoid directly using the internal structs __tagptr{1, 2, 3...} + */ +#define __MAKE_TAGPTR(n) \ +typedef struct __tagptr##n { \ + uintptr_t v; \ +} tagptr##n##_t; + +__MAKE_TAGPTR(1) +__MAKE_TAGPTR(2) +__MAKE_TAGPTR(3) +__MAKE_TAGPTR(4) + +#undef __MAKE_TAGPTR + +extern void __compiletime_error("bad tagptr tags") + __bad_tagptr_tags(void); + +extern void __compiletime_error("bad tagptr type") + __bad_tagptr_type(void); + +/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */ +#define __tagptr_mask_1(ptr, n) \ + __builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \ + (1UL << (n)) - 1 : + +#define __tagptr_mask(ptr) (\ + __tagptr_mask_1(ptr, 1) ( \ + __tagptr_mask_1(ptr, 2) ( \ + __tagptr_mask_1(ptr, 3) ( \ + __tagptr_mask_1(ptr, 4) ( \ + __bad_tagptr_type(), 0))))) + +/* generate a tagged pointer from a raw value */ +#define tagptr_init(type, val) \ + ((typeof(type)){ .v = (uintptr_t)(val) }) + +/* + * directly cast a tagged pointer to the native pointer type, which + * could be used for backward compatibility of existing code. 
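+ * Note that the tag bits are left in place by this cast; use
+ * tagptr_unfold_ptr() below when an untagged pointer is needed.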
+ */ +#define tagptr_cast_ptr(tptr) ((void *)(tptr).v) + +/* encode tagged pointers */ +#define tagptr_fold(type, ptr, _tags) ({ \ + const typeof(_tags) tags = (_tags); \ + if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \ + __bad_tagptr_tags(); \ +tagptr_init(type, (uintptr_t)(ptr) | tags); }) + +/* decode tagged pointers */ +#define tagptr_unfold_ptr(tptr) \ + ((void *)((tptr).v & ~__tagptr_mask(tptr))) + +#define tagptr_unfold_tags(tptr) \ + ((tptr).v & __tagptr_mask(tptr)) + +/* operations for the tagger pointer */ +#define tagptr_eq(_tptr1, _tptr2) ({ \ + typeof(_tptr1) tptr1 = (_tptr1); \ + typeof(_tptr2) tptr2 = (_tptr2); \ + (void)(&tptr1 == &tptr2); \ +(tptr1).v == (tptr2).v; }) + +/* lock-free CAS operation */ +#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + typeof(_o) o = (_o); \ + typeof(_n) n = (_n); \ + (void)(&o == &n); \ + (void)(&o == ptptr); \ +tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); }) + +/* wrap WRITE_ONCE if atomic update is needed */ +#define tagptr_replace_tags(_ptptr, tags) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + *ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \ +*ptptr; }) + +#define tagptr_set_tags(_ptptr, _tags) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + const typeof(_tags) tags = (_tags); \ + if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ + __bad_tagptr_tags(); \ + ptptr->v |= tags; \ +*ptptr; }) + +#define tagptr_clear_tags(_ptptr, _tags) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + const typeof(_tags) tags = (_tags); \ + if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ + __bad_tagptr_tags(); \ + ptptr->v &= ~tags; \ +*ptptr; }) + +#endif /* __EROFS_FS_TAGPTR_H */ + diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c new file mode 100644 index 000000000000..1dd041aa0f5a --- /dev/null +++ b/fs/erofs/utils.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "internal.h" +#include <linux/pagevec.h> + +struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail) +{ + struct page *page; + + if (!list_empty(pool)) { + page = lru_to_page(pool); + DBG_BUGON(page_ref_count(page) != 1); + list_del(&page->lru); + } else { + page = alloc_pages(gfp | (nofail ? 
__GFP_NOFAIL : 0), 0); + } + return page; +} + +#if (EROFS_PCPUBUF_NR_PAGES > 0) +static struct { + u8 data[PAGE_SIZE * EROFS_PCPUBUF_NR_PAGES]; +} ____cacheline_aligned_in_smp erofs_pcpubuf[NR_CPUS]; + +void *erofs_get_pcpubuf(unsigned int pagenr) +{ + preempt_disable(); + return &erofs_pcpubuf[smp_processor_id()].data[pagenr * PAGE_SIZE]; +} +#endif + +#ifdef CONFIG_EROFS_FS_ZIP +/* global shrink count (for all mounted EROFS instances) */ +static atomic_long_t erofs_global_shrink_cnt; + +#define __erofs_workgroup_get(grp) atomic_inc(&(grp)->refcount) +#define __erofs_workgroup_put(grp) atomic_dec(&(grp)->refcount) + +static int erofs_workgroup_get(struct erofs_workgroup *grp) +{ + int o; + +repeat: + o = erofs_wait_on_workgroup_freezed(grp); + if (unlikely(o <= 0)) + return -1; + + if (unlikely(atomic_cmpxchg(&grp->refcount, o, o + 1) != o)) + goto repeat; + + /* decrease refcount paired by erofs_workgroup_put */ + if (unlikely(o == 1)) + atomic_long_dec(&erofs_global_shrink_cnt); + return 0; +} + +struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, + pgoff_t index, bool *tag) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_workgroup *grp; + +repeat: + rcu_read_lock(); + grp = radix_tree_lookup(&sbi->workstn_tree, index); + if (grp) { + *tag = xa_pointer_tag(grp); + grp = xa_untag_pointer(grp); + + if (erofs_workgroup_get(grp)) { + /* prefer to relax rcu read side */ + rcu_read_unlock(); + goto repeat; + } + + DBG_BUGON(index != grp->index); + } + rcu_read_unlock(); + return grp; +} + +int erofs_register_workgroup(struct super_block *sb, + struct erofs_workgroup *grp, + bool tag) +{ + struct erofs_sb_info *sbi; + int err; + + /* grp shouldn't be broken or used before */ + if (unlikely(atomic_read(&grp->refcount) != 1)) { + DBG_BUGON(1); + return -EINVAL; + } + + err = radix_tree_preload(GFP_NOFS); + if (err) + return err; + + sbi = EROFS_SB(sb); + xa_lock(&sbi->workstn_tree); + + grp = xa_tag_pointer(grp, tag); + + /* + * Bump up reference count before making this workgroup + * visible to other users in order to avoid potential UAF + * without serialized by workstn_lock. + */ + __erofs_workgroup_get(grp); + + err = radix_tree_insert(&sbi->workstn_tree, grp->index, grp); + if (unlikely(err)) + /* + * it's safe to decrease since the workgroup isn't visible + * and refcount >= 2 (cannot be freezed). + */ + __erofs_workgroup_put(grp); + + xa_unlock(&sbi->workstn_tree); + radix_tree_preload_end(); + return err; +} + +static void __erofs_workgroup_free(struct erofs_workgroup *grp) +{ + atomic_long_dec(&erofs_global_shrink_cnt); + erofs_workgroup_free_rcu(grp); +} + +int erofs_workgroup_put(struct erofs_workgroup *grp) +{ + int count = atomic_dec_return(&grp->refcount); + + if (count == 1) + atomic_long_inc(&erofs_global_shrink_cnt); + else if (!count) + __erofs_workgroup_free(grp); + return count; +} + +static void erofs_workgroup_unfreeze_final(struct erofs_workgroup *grp) +{ + erofs_workgroup_unfreeze(grp, 0); + __erofs_workgroup_free(grp); +} + +static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, + struct erofs_workgroup *grp, + bool cleanup) +{ + /* + * If managed cache is on, refcount of workgroups + * themselves could be < 0 (freezed). In other words, + * there is no guarantee that all refcounts > 0. + */ + if (!erofs_workgroup_try_to_freeze(grp, 1)) + return false; + + /* + * Note that all cached pages should be unattached + * before deleted from the radix tree. 
Otherwise some + * cached pages could be still attached to the orphan + * old workgroup when the new one is available in the tree. + */ + if (erofs_try_to_free_all_cached_pages(sbi, grp)) { + erofs_workgroup_unfreeze(grp, 1); + return false; + } + + /* + * It's impossible to fail after the workgroup is freezed, + * however in order to avoid some race conditions, add a + * DBG_BUGON to observe this in advance. + */ + DBG_BUGON(xa_untag_pointer(radix_tree_delete(&sbi->workstn_tree, + grp->index)) != grp); + + /* + * If managed cache is on, last refcount should indicate + * the related workstation. + */ + erofs_workgroup_unfreeze_final(grp); + return true; +} + +static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, + unsigned long nr_shrink, + bool cleanup) +{ + pgoff_t first_index = 0; + void *batch[PAGEVEC_SIZE]; + unsigned int freed = 0; + + int i, found; +repeat: + xa_lock(&sbi->workstn_tree); + + found = radix_tree_gang_lookup(&sbi->workstn_tree, + batch, first_index, PAGEVEC_SIZE); + + for (i = 0; i < found; ++i) { + struct erofs_workgroup *grp = xa_untag_pointer(batch[i]); + + first_index = grp->index + 1; + + /* try to shrink each valid workgroup */ + if (!erofs_try_to_release_workgroup(sbi, grp, cleanup)) + continue; + + ++freed; + if (unlikely(!--nr_shrink)) + break; + } + xa_unlock(&sbi->workstn_tree); + + if (i && nr_shrink) + goto repeat; + return freed; +} + +/* protected by 'erofs_sb_list_lock' */ +static unsigned int shrinker_run_no; + +/* protects the mounted 'erofs_sb_list' */ +static DEFINE_SPINLOCK(erofs_sb_list_lock); +static LIST_HEAD(erofs_sb_list); + +void erofs_shrinker_register(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + mutex_init(&sbi->umount_mutex); + + spin_lock(&erofs_sb_list_lock); + list_add(&sbi->list, &erofs_sb_list); + spin_unlock(&erofs_sb_list_lock); +} + +void erofs_shrinker_unregister(struct super_block *sb) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + + mutex_lock(&sbi->umount_mutex); + erofs_shrink_workstation(sbi, ~0UL, true); + + spin_lock(&erofs_sb_list_lock); + list_del(&sbi->list); + spin_unlock(&erofs_sb_list_lock); + mutex_unlock(&sbi->umount_mutex); +} + +static unsigned long erofs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + return atomic_long_read(&erofs_global_shrink_cnt); +} + +static unsigned long erofs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct erofs_sb_info *sbi; + struct list_head *p; + + unsigned long nr = sc->nr_to_scan; + unsigned int run_no; + unsigned long freed = 0; + + spin_lock(&erofs_sb_list_lock); + do { + run_no = ++shrinker_run_no; + } while (run_no == 0); + + /* Iterate over all mounted superblocks and try to shrink them */ + p = erofs_sb_list.next; + while (p != &erofs_sb_list) { + sbi = list_entry(p, struct erofs_sb_info, list); + + /* + * We move the ones we do to the end of the list, so we stop + * when we see one we have already done. + */ + if (sbi->shrinker_run_no == run_no) + break; + + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + + spin_unlock(&erofs_sb_list_lock); + sbi->shrinker_run_no = run_no; + + freed += erofs_shrink_workstation(sbi, nr, false); + + spin_lock(&erofs_sb_list_lock); + /* Get the next list element before we move this one */ + p = p->next; + + /* + * Move this one to the end of the list to provide some + * fairness. 
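+		 * Together with the shrinker_run_no check above, this avoids
+		 * always rescanning the same superblock first on the next pass.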
+ */ + list_move_tail(&sbi->list, &erofs_sb_list); + mutex_unlock(&sbi->umount_mutex); + + if (freed >= nr) + break; + } + spin_unlock(&erofs_sb_list_lock); + return freed; +} + +static struct shrinker erofs_shrinker_info = { + .scan_objects = erofs_shrink_scan, + .count_objects = erofs_shrink_count, + .seeks = DEFAULT_SEEKS, +}; + +int __init erofs_init_shrinker(void) +{ + return register_shrinker(&erofs_shrinker_info); +} + +void erofs_exit_shrinker(void) +{ + unregister_shrinker(&erofs_shrinker_info); +} +#endif /* !CONFIG_EROFS_FS_ZIP */ + diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c new file mode 100644 index 000000000000..a8286998a079 --- /dev/null +++ b/fs/erofs/xattr.c @@ -0,0 +1,703 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include <linux/security.h> +#include "xattr.h" + +struct xattr_iter { + struct super_block *sb; + struct page *page; + void *kaddr; + + erofs_blk_t blkaddr; + unsigned int ofs; +}; + +static inline void xattr_iter_end(struct xattr_iter *it, bool atomic) +{ + /* the only user of kunmap() is 'init_inode_xattrs' */ + if (unlikely(!atomic)) + kunmap(it->page); + else + kunmap_atomic(it->kaddr); + + unlock_page(it->page); + put_page(it->page); +} + +static inline void xattr_iter_end_final(struct xattr_iter *it) +{ + if (!it->page) + return; + + xattr_iter_end(it, true); +} + +static int init_inode_xattrs(struct inode *inode) +{ + struct erofs_vnode *const vi = EROFS_V(inode); + struct xattr_iter it; + unsigned int i; + struct erofs_xattr_ibody_header *ih; + struct super_block *sb; + struct erofs_sb_info *sbi; + bool atomic_map; + int ret = 0; + + /* the most case is that xattrs of this inode are initialized. */ + if (test_bit(EROFS_V_EA_INITED_BIT, &vi->flags)) + return 0; + + if (wait_on_bit_lock(&vi->flags, EROFS_V_BL_XATTR_BIT, TASK_KILLABLE)) + return -ERESTARTSYS; + + /* someone has initialized xattrs for us? */ + if (test_bit(EROFS_V_EA_INITED_BIT, &vi->flags)) + goto out_unlock; + + /* + * bypass all xattr operations if ->xattr_isize is not greater than + * sizeof(struct erofs_xattr_ibody_header), in detail: + * 1) it is not enough to contain erofs_xattr_ibody_header then + * ->xattr_isize should be 0 (it means no xattr); + * 2) it is just to contain erofs_xattr_ibody_header, which is on-disk + * undefined right now (maybe use later with some new sb feature). 
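+	 * In either case there is nothing to parse: 1) yields -ENOATTR,
+	 * 2) is rejected with -EOPNOTSUPP, and any other size smaller than
+	 * the header is treated as on-disk corruption.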
+ */ + if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) { + errln("xattr_isize %d of nid %llu is not supported yet", + vi->xattr_isize, vi->nid); + ret = -EOPNOTSUPP; + goto out_unlock; + } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) { + if (unlikely(vi->xattr_isize)) { + errln("bogus xattr ibody @ nid %llu", vi->nid); + DBG_BUGON(1); + ret = -EFSCORRUPTED; + goto out_unlock; /* xattr ondisk layout error */ + } + ret = -ENOATTR; + goto out_unlock; + } + + sb = inode->i_sb; + sbi = EROFS_SB(sb); + it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize); + it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize); + + it.page = erofs_get_inline_page(inode, it.blkaddr); + if (IS_ERR(it.page)) { + ret = PTR_ERR(it.page); + goto out_unlock; + } + + /* read in shared xattr array (non-atomic, see kmalloc below) */ + it.kaddr = kmap(it.page); + atomic_map = false; + + ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs); + + vi->xattr_shared_count = ih->h_shared_count; + vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, + sizeof(uint), GFP_KERNEL); + if (!vi->xattr_shared_xattrs) { + xattr_iter_end(&it, atomic_map); + ret = -ENOMEM; + goto out_unlock; + } + + /* let's skip ibody header */ + it.ofs += sizeof(struct erofs_xattr_ibody_header); + + for (i = 0; i < vi->xattr_shared_count; ++i) { + if (unlikely(it.ofs >= EROFS_BLKSIZ)) { + /* cannot be unaligned */ + DBG_BUGON(it.ofs != EROFS_BLKSIZ); + xattr_iter_end(&it, atomic_map); + + it.page = erofs_get_meta_page(sb, ++it.blkaddr, + S_ISDIR(inode->i_mode)); + if (IS_ERR(it.page)) { + kfree(vi->xattr_shared_xattrs); + vi->xattr_shared_xattrs = NULL; + ret = PTR_ERR(it.page); + goto out_unlock; + } + + it.kaddr = kmap_atomic(it.page); + atomic_map = true; + it.ofs = 0; + } + vi->xattr_shared_xattrs[i] = + le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs)); + it.ofs += sizeof(__le32); + } + xattr_iter_end(&it, atomic_map); + + set_bit(EROFS_V_EA_INITED_BIT, &vi->flags); + +out_unlock: + clear_and_wake_up_bit(EROFS_V_BL_XATTR_BIT, &vi->flags); + return ret; +} + +/* + * the general idea for these return values is + * if 0 is returned, go on processing the current xattr; + * 1 (> 0) is returned, skip this round to process the next xattr; + * -err (< 0) is returned, an error (maybe ENOXATTR) occurred + * and need to be handled + */ +struct xattr_iter_handlers { + int (*entry)(struct xattr_iter *_it, struct erofs_xattr_entry *entry); + int (*name)(struct xattr_iter *_it, unsigned int processed, char *buf, + unsigned int len); + int (*alloc_buffer)(struct xattr_iter *_it, unsigned int value_sz); + void (*value)(struct xattr_iter *_it, unsigned int processed, char *buf, + unsigned int len); +}; + +static inline int xattr_iter_fixup(struct xattr_iter *it) +{ + if (it->ofs < EROFS_BLKSIZ) + return 0; + + xattr_iter_end(it, true); + + it->blkaddr += erofs_blknr(it->ofs); + + it->page = erofs_get_meta_page(it->sb, it->blkaddr, false); + if (IS_ERR(it->page)) { + int err = PTR_ERR(it->page); + + it->page = NULL; + return err; + } + + it->kaddr = kmap_atomic(it->page); + it->ofs = erofs_blkoff(it->ofs); + return 0; +} + +static int inline_xattr_iter_begin(struct xattr_iter *it, + struct inode *inode) +{ + struct erofs_vnode *const vi = EROFS_V(inode); + struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb); + unsigned int xattr_header_sz, inline_xattr_ofs; + + xattr_header_sz = inlinexattr_header_size(inode); + if (unlikely(xattr_header_sz >= vi->xattr_isize)) { + DBG_BUGON(xattr_header_sz > 
vi->xattr_isize); + return -ENOATTR; + } + + inline_xattr_ofs = vi->inode_isize + xattr_header_sz; + + it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs); + it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); + + it->page = erofs_get_inline_page(inode, it->blkaddr); + if (IS_ERR(it->page)) + return PTR_ERR(it->page); + + it->kaddr = kmap_atomic(it->page); + return vi->xattr_isize - xattr_header_sz; +} + +/* + * Regardless of success or failure, `xattr_foreach' will end up with + * `ofs' pointing to the next xattr item rather than an arbitrary position. + */ +static int xattr_foreach(struct xattr_iter *it, + const struct xattr_iter_handlers *op, + unsigned int *tlimit) +{ + struct erofs_xattr_entry entry; + unsigned int value_sz, processed, slice; + int err; + + /* 0. fixup blkaddr, ofs, ipage */ + err = xattr_iter_fixup(it); + if (err) + return err; + + /* + * 1. read xattr entry to the memory, + * since we do EROFS_XATTR_ALIGN + * therefore entry should be in the page + */ + entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs); + if (tlimit) { + unsigned int entry_sz = EROFS_XATTR_ENTRY_SIZE(&entry); + + /* xattr on-disk corruption: xattr entry beyond xattr_isize */ + if (unlikely(*tlimit < entry_sz)) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + *tlimit -= entry_sz; + } + + it->ofs += sizeof(struct erofs_xattr_entry); + value_sz = le16_to_cpu(entry.e_value_size); + + /* handle entry */ + err = op->entry(it, &entry); + if (err) { + it->ofs += entry.e_name_len + value_sz; + goto out; + } + + /* 2. handle xattr name (ofs will finally be at the end of name) */ + processed = 0; + + while (processed < entry.e_name_len) { + if (it->ofs >= EROFS_BLKSIZ) { + DBG_BUGON(it->ofs > EROFS_BLKSIZ); + + err = xattr_iter_fixup(it); + if (err) + goto out; + it->ofs = 0; + } + + slice = min_t(unsigned int, PAGE_SIZE - it->ofs, + entry.e_name_len - processed); + + /* handle name */ + err = op->name(it, processed, it->kaddr + it->ofs, slice); + if (err) { + it->ofs += entry.e_name_len - processed + value_sz; + goto out; + } + + it->ofs += slice; + processed += slice; + } + + /* 3. handle xattr value */ + processed = 0; + + if (op->alloc_buffer) { + err = op->alloc_buffer(it, value_sz); + if (err) { + it->ofs += value_sz; + goto out; + } + } + + while (processed < value_sz) { + if (it->ofs >= EROFS_BLKSIZ) { + DBG_BUGON(it->ofs > EROFS_BLKSIZ); + + err = xattr_iter_fixup(it); + if (err) + goto out; + it->ofs = 0; + } + + slice = min_t(unsigned int, PAGE_SIZE - it->ofs, + value_sz - processed); + op->value(it, processed, it->kaddr + it->ofs, slice); + it->ofs += slice; + processed += slice; + } + +out: + /* xattrs should be 4-byte aligned (on-disk constraint) */ + it->ofs = EROFS_XATTR_ALIGN(it->ofs); + return err < 0 ? err : 0; +} + +struct getxattr_iter { + struct xattr_iter it; + + char *buffer; + int buffer_size, index; + struct qstr name; +}; + +static int xattr_entrymatch(struct xattr_iter *_it, + struct erofs_xattr_entry *entry) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + return (it->index != entry->e_name_index || + it->name.len != entry->e_name_len) ? -ENOATTR : 0; +} + +static int xattr_namematch(struct xattr_iter *_it, + unsigned int processed, char *buf, unsigned int len) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + return memcmp(buf, it->name.name + processed, len) ? 
-ENOATTR : 0; +} + +static int xattr_checkbuffer(struct xattr_iter *_it, + unsigned int value_sz) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + int err = it->buffer_size < value_sz ? -ERANGE : 0; + + it->buffer_size = value_sz; + return !it->buffer ? 1 : err; +} + +static void xattr_copyvalue(struct xattr_iter *_it, + unsigned int processed, + char *buf, unsigned int len) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + memcpy(it->buffer + processed, buf, len); +} + +static const struct xattr_iter_handlers find_xattr_handlers = { + .entry = xattr_entrymatch, + .name = xattr_namematch, + .alloc_buffer = xattr_checkbuffer, + .value = xattr_copyvalue +}; + +static int inline_getxattr(struct inode *inode, struct getxattr_iter *it) +{ + int ret; + unsigned int remaining; + + ret = inline_xattr_iter_begin(&it->it, inode); + if (ret < 0) + return ret; + + remaining = ret; + while (remaining) { + ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining); + if (ret != -ENOATTR) + break; + } + xattr_iter_end_final(&it->it); + + return ret ? ret : it->buffer_size; +} + +static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) +{ + struct erofs_vnode *const vi = EROFS_V(inode); + struct super_block *const sb = inode->i_sb; + struct erofs_sb_info *const sbi = EROFS_SB(sb); + unsigned int i; + int ret = -ENOATTR; + + for (i = 0; i < vi->xattr_shared_count; ++i) { + erofs_blk_t blkaddr = + xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); + + it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); + + if (!i || blkaddr != it->it.blkaddr) { + if (i) + xattr_iter_end(&it->it, true); + + it->it.page = erofs_get_meta_page(sb, blkaddr, false); + if (IS_ERR(it->it.page)) + return PTR_ERR(it->it.page); + + it->it.kaddr = kmap_atomic(it->it.page); + it->it.blkaddr = blkaddr; + } + + ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL); + if (ret != -ENOATTR) + break; + } + if (vi->xattr_shared_count) + xattr_iter_end_final(&it->it); + + return ret ? 
ret : it->buffer_size; +} + +static bool erofs_xattr_user_list(struct dentry *dentry) +{ + return test_opt(EROFS_SB(dentry->d_sb), XATTR_USER); +} + +static bool erofs_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); +} + +int erofs_getxattr(struct inode *inode, int index, + const char *name, + void *buffer, size_t buffer_size) +{ + int ret; + struct getxattr_iter it; + + if (unlikely(!name)) + return -EINVAL; + + ret = init_inode_xattrs(inode); + if (ret) + return ret; + + it.index = index; + + it.name.len = strlen(name); + if (it.name.len > EROFS_NAME_LEN) + return -ERANGE; + it.name.name = name; + + it.buffer = buffer; + it.buffer_size = buffer_size; + + it.it.sb = inode->i_sb; + ret = inline_getxattr(inode, &it); + if (ret == -ENOATTR) + ret = shared_getxattr(inode, &it); + return ret; +} + +static int erofs_xattr_generic_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + struct erofs_sb_info *const sbi = EROFS_I_SB(inode); + + switch (handler->flags) { + case EROFS_XATTR_INDEX_USER: + if (!test_opt(sbi, XATTR_USER)) + return -EOPNOTSUPP; + break; + case EROFS_XATTR_INDEX_TRUSTED: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + break; + case EROFS_XATTR_INDEX_SECURITY: + break; + default: + return -EINVAL; + } + + return erofs_getxattr(inode, handler->flags, name, buffer, size); +} + +const struct xattr_handler erofs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = EROFS_XATTR_INDEX_USER, + .list = erofs_xattr_user_list, + .get = erofs_xattr_generic_get, +}; + +const struct xattr_handler erofs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = EROFS_XATTR_INDEX_TRUSTED, + .list = erofs_xattr_trusted_list, + .get = erofs_xattr_generic_get, +}; + +#ifdef CONFIG_EROFS_FS_SECURITY +const struct xattr_handler __maybe_unused erofs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = EROFS_XATTR_INDEX_SECURITY, + .get = erofs_xattr_generic_get, +}; +#endif + +const struct xattr_handler *erofs_xattr_handlers[] = { + &erofs_xattr_user_handler, +#ifdef CONFIG_EROFS_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &erofs_xattr_trusted_handler, +#ifdef CONFIG_EROFS_FS_SECURITY + &erofs_xattr_security_handler, +#endif + NULL, +}; + +struct listxattr_iter { + struct xattr_iter it; + + struct dentry *dentry; + char *buffer; + int buffer_size, buffer_ofs; +}; + +static int xattr_entrylist(struct xattr_iter *_it, + struct erofs_xattr_entry *entry) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + unsigned int prefix_len; + const char *prefix; + + const struct xattr_handler *h = + erofs_xattr_handler(entry->e_name_index); + + if (!h || (h->list && !h->list(it->dentry))) + return 1; + + prefix = xattr_prefix(h); + prefix_len = strlen(prefix); + + if (!it->buffer) { + it->buffer_ofs += prefix_len + entry->e_name_len + 1; + return 1; + } + + if (it->buffer_ofs + prefix_len + + entry->e_name_len + 1 > it->buffer_size) + return -ERANGE; + + memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len); + it->buffer_ofs += prefix_len; + return 0; +} + +static int xattr_namelist(struct xattr_iter *_it, + unsigned int processed, char *buf, unsigned int len) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + + memcpy(it->buffer + it->buffer_ofs, buf, len); + it->buffer_ofs += len; + return 0; +} + +static int xattr_skipvalue(struct 
xattr_iter *_it, + unsigned int value_sz) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + + it->buffer[it->buffer_ofs++] = '\0'; + return 1; +} + +static const struct xattr_iter_handlers list_xattr_handlers = { + .entry = xattr_entrylist, + .name = xattr_namelist, + .alloc_buffer = xattr_skipvalue, + .value = NULL +}; + +static int inline_listxattr(struct listxattr_iter *it) +{ + int ret; + unsigned int remaining; + + ret = inline_xattr_iter_begin(&it->it, d_inode(it->dentry)); + if (ret < 0) + return ret; + + remaining = ret; + while (remaining) { + ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining); + if (ret) + break; + } + xattr_iter_end_final(&it->it); + return ret ? ret : it->buffer_ofs; +} + +static int shared_listxattr(struct listxattr_iter *it) +{ + struct inode *const inode = d_inode(it->dentry); + struct erofs_vnode *const vi = EROFS_V(inode); + struct super_block *const sb = inode->i_sb; + struct erofs_sb_info *const sbi = EROFS_SB(sb); + unsigned int i; + int ret = 0; + + for (i = 0; i < vi->xattr_shared_count; ++i) { + erofs_blk_t blkaddr = + xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); + + it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); + if (!i || blkaddr != it->it.blkaddr) { + if (i) + xattr_iter_end(&it->it, true); + + it->it.page = erofs_get_meta_page(sb, blkaddr, false); + if (IS_ERR(it->it.page)) + return PTR_ERR(it->it.page); + + it->it.kaddr = kmap_atomic(it->it.page); + it->it.blkaddr = blkaddr; + } + + ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL); + if (ret) + break; + } + if (vi->xattr_shared_count) + xattr_iter_end_final(&it->it); + + return ret ? ret : it->buffer_ofs; +} + +ssize_t erofs_listxattr(struct dentry *dentry, + char *buffer, size_t buffer_size) +{ + int ret; + struct listxattr_iter it; + + ret = init_inode_xattrs(d_inode(dentry)); + if (ret) + return ret; + + it.dentry = dentry; + it.buffer = buffer; + it.buffer_size = buffer_size; + it.buffer_ofs = 0; + + it.it.sb = dentry->d_sb; + + ret = inline_listxattr(&it); + if (ret < 0 && ret != -ENOATTR) + return ret; + return shared_listxattr(&it); +} + +#ifdef CONFIG_EROFS_FS_POSIX_ACL +struct posix_acl *erofs_get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + int prefix, rc; + char *value = NULL; + + switch (type) { + case ACL_TYPE_ACCESS: + prefix = EROFS_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + prefix = EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + rc = erofs_getxattr(inode, prefix, "", NULL, 0); + if (rc > 0) { + value = kmalloc(rc, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + rc = erofs_getxattr(inode, prefix, "", value, rc); + } + + if (rc == -ENOATTR) + acl = NULL; + else if (rc < 0) + acl = ERR_PTR(rc); + else + acl = posix_acl_from_xattr(&init_user_ns, value, rc); + kfree(value); + return acl; +} +#endif + diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h new file mode 100644 index 000000000000..c5ca47d814dd --- /dev/null +++ b/fs/erofs/xattr.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2017-2018 HUAWEI, Inc. 
+ * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_XATTR_H +#define __EROFS_XATTR_H + +#include "internal.h" +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> + +/* Attribute not found */ +#define ENOATTR ENODATA + +static inline unsigned int inlinexattr_header_size(struct inode *inode) +{ + return sizeof(struct erofs_xattr_ibody_header) + + sizeof(u32) * EROFS_V(inode)->xattr_shared_count; +} + +static inline erofs_blk_t xattrblock_addr(struct erofs_sb_info *sbi, + unsigned int xattr_id) +{ +#ifdef CONFIG_EROFS_FS_XATTR + return sbi->xattr_blkaddr + + xattr_id * sizeof(__u32) / EROFS_BLKSIZ; +#else + return 0; +#endif +} + +static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi, + unsigned int xattr_id) +{ + return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ; +} + +#ifdef CONFIG_EROFS_FS_XATTR +extern const struct xattr_handler erofs_xattr_user_handler; +extern const struct xattr_handler erofs_xattr_trusted_handler; +#ifdef CONFIG_EROFS_FS_SECURITY +extern const struct xattr_handler erofs_xattr_security_handler; +#endif + +static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx) +{ +static const struct xattr_handler *xattr_handler_map[] = { + [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, +#ifdef CONFIG_EROFS_FS_POSIX_ACL + [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = + &posix_acl_default_xattr_handler, +#endif + [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, +#ifdef CONFIG_EROFS_FS_SECURITY + [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler, +#endif +}; + + return idx && idx < ARRAY_SIZE(xattr_handler_map) ? + xattr_handler_map[idx] : NULL; +} + +extern const struct xattr_handler *erofs_xattr_handlers[]; + +int erofs_getxattr(struct inode *, int, const char *, void *, size_t); +ssize_t erofs_listxattr(struct dentry *, char *, size_t); +#else +static inline int erofs_getxattr(struct inode *inode, int index, + const char *name, void *buffer, + size_t buffer_size) +{ + return -EOPNOTSUPP; +} + +static inline ssize_t erofs_listxattr(struct dentry *dentry, + char *buffer, size_t buffer_size) +{ + return -EOPNOTSUPP; +} +#endif /* !CONFIG_EROFS_FS_XATTR */ + +#ifdef CONFIG_EROFS_FS_POSIX_ACL +struct posix_acl *erofs_get_acl(struct inode *inode, int type); +#else +#define erofs_get_acl (NULL) +#endif + +#endif + diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c new file mode 100644 index 000000000000..b32ad585237c --- /dev/null +++ b/fs/erofs/zdata.c @@ -0,0 +1,1432 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "zdata.h" +#include "compress.h" +#include <linux/prefetch.h> + +#include <trace/events/erofs.h> + +/* + * a compressed_pages[] placeholder in order to avoid + * being filled with file pages for in-place decompression. 
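+ * Such slots are only backed by real cache pages when the read bio is
+ * about to be submitted (see DELAYEDALLOC below).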
+ */
+#define PAGE_UNALLOCATED ((void *)0x5F0E4B1D)
+
+/* how to allocate cached pages for a pcluster */
+enum z_erofs_cache_alloctype {
+ DONTALLOC, /* don't allocate any cached pages */
+ DELAYEDALLOC, /* delayed allocation (at the time of submitting io) */
+};
+
+/*
+ * tagged pointer with 1-bit tag for all compressed pages
+ * tag 0 - the page is just found with an extra page reference
+ */
+typedef tagptr1_t compressed_page_t;
+
+#define tag_compressed_page_justfound(page) \
+ tagptr_fold(compressed_page_t, page, 1)
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *pcluster_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+ destroy_workqueue(z_erofs_workqueue);
+ kmem_cache_destroy(pcluster_cachep);
+}
+
+static inline int init_unzip_workqueue(void)
+{
+ const unsigned int onlinecpus = num_possible_cpus();
+ const unsigned int flags = WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE;
+
+ /*
+ * no need to spawn too many threads; limiting the number of threads
+ * can minimize scheduling overhead. Perhaps per-CPU threads would be better?
+ */
+ z_erofs_workqueue = alloc_workqueue("erofs_unzipd", flags,
+ onlinecpus + onlinecpus / 4);
+ return z_erofs_workqueue ? 0 : -ENOMEM;
+}
+
+static void init_once(void *ptr)
+{
+ struct z_erofs_pcluster *pcl = ptr;
+ struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
+ unsigned int i;
+
+ mutex_init(&cl->lock);
+ cl->nr_pages = 0;
+ cl->vcnt = 0;
+ for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i)
+ pcl->compressed_pages[i] = NULL;
+}
+
+static void init_always(struct z_erofs_pcluster *pcl)
+{
+ struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
+
+ atomic_set(&pcl->obj.refcount, 1);
+
+ DBG_BUGON(cl->nr_pages);
+ DBG_BUGON(cl->vcnt);
+}
+
+int __init z_erofs_init_zip_subsystem(void)
+{
+ pcluster_cachep = kmem_cache_create("erofs_compress",
+ Z_EROFS_WORKGROUP_SIZE, 0,
+ SLAB_RECLAIM_ACCOUNT, init_once);
+ if (pcluster_cachep) {
+ if (!init_unzip_workqueue())
+ return 0;
+
+ kmem_cache_destroy(pcluster_cachep);
+ }
+ return -ENOMEM;
+}
+
+enum z_erofs_collectmode {
+ COLLECT_SECONDARY,
+ COLLECT_PRIMARY,
+ /*
+ * The current collection is the tail of an existing chain; in addition,
+ * the previously processed chained collections have all been decided to
+ * be hooked up to it. 
+ * A new chain will be created for the remaining collections which are + * not processed yet, therefore different from COLLECT_PRIMARY_FOLLOWED, + * the next collection cannot reuse the whole page safely in + * the following scenario: + * ________________________________________________________________ + * | tail (partial) page | head (partial) page | + * | (belongs to the next cl) | (belongs to the current cl) | + * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________| + */ + COLLECT_PRIMARY_HOOKED, + COLLECT_PRIMARY_FOLLOWED_NOINPLACE, + /* + * The current collection has been linked with the owned chain, and + * could also be linked with the remaining collections, which means + * if the processing page is the tail page of the collection, thus + * the current collection can safely use the whole page (since + * the previous collection is under control) for in-place I/O, as + * illustrated below: + * ________________________________________________________________ + * | tail (partial) page | head (partial) page | + * | (of the current cl) | (of the previous collection) | + * | PRIMARY_FOLLOWED or | | + * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________| + * + * [ (*) the above page can be used as inplace I/O. ] + */ + COLLECT_PRIMARY_FOLLOWED, +}; + +struct z_erofs_collector { + struct z_erofs_pagevec_ctor vector; + + struct z_erofs_pcluster *pcl, *tailpcl; + struct z_erofs_collection *cl; + struct page **compressedpages; + z_erofs_next_pcluster_t owned_head; + + enum z_erofs_collectmode mode; +}; + +struct z_erofs_decompress_frontend { + struct inode *const inode; + + struct z_erofs_collector clt; + struct erofs_map_blocks map; + + /* used for applying cache strategy on the fly */ + bool backmost; + erofs_off_t headoffset; +}; + +#define COLLECTOR_INIT() { \ + .owned_head = Z_EROFS_PCLUSTER_TAIL, \ + .mode = COLLECT_PRIMARY_FOLLOWED } + +#define DECOMPRESS_FRONTEND_INIT(__i) { \ + .inode = __i, .clt = COLLECTOR_INIT(), \ + .backmost = true, } + +static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; +static DEFINE_MUTEX(z_pagemap_global_lock); + +static void preload_compressed_pages(struct z_erofs_collector *clt, + struct address_space *mc, + enum z_erofs_cache_alloctype type, + struct list_head *pagepool) +{ + const struct z_erofs_pcluster *pcl = clt->pcl; + const unsigned int clusterpages = BIT(pcl->clusterbits); + struct page **pages = clt->compressedpages; + pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages); + bool standalone = true; + + if (clt->mode < COLLECT_PRIMARY_FOLLOWED) + return; + + for (; pages < pcl->compressed_pages + clusterpages; ++pages) { + struct page *page; + compressed_page_t t; + + /* the compressed page was loaded before */ + if (READ_ONCE(*pages)) + continue; + + page = find_get_page(mc, index); + + if (page) { + t = tag_compressed_page_justfound(page); + } else if (type == DELAYEDALLOC) { + t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED); + } else { /* DONTALLOC */ + if (standalone) + clt->compressedpages = pages; + standalone = false; + continue; + } + + if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) + continue; + + if (page) + put_page(page); + } + + if (standalone) /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */ + clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; +} + +/* called by erofs_shrinker to get rid of all compressed_pages */ +int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, + struct erofs_workgroup *grp) +{ + struct z_erofs_pcluster *const pcl = + container_of(grp, 
struct z_erofs_pcluster, obj);
+ struct address_space *const mapping = MNGD_MAPPING(sbi);
+ const unsigned int clusterpages = BIT(pcl->clusterbits);
+ int i;
+
+ /*
+ * the refcount of the workgroup is now frozen at 1,
+ * therefore there is no need to worry about concurrent decompression users.
+ */
+ for (i = 0; i < clusterpages; ++i) {
+ struct page *page = pcl->compressed_pages[i];
+
+ if (!page)
+ continue;
+
+ /* block other users from reclaiming or migrating the page */
+ if (!trylock_page(page))
+ return -EBUSY;
+
+ if (unlikely(page->mapping != mapping))
+ continue;
+
+ /* barrier is implied in the following 'unlock_page' */
+ WRITE_ONCE(pcl->compressed_pages[i], NULL);
+ set_page_private(page, 0);
+ ClearPagePrivate(page);
+
+ unlock_page(page);
+ put_page(page);
+ }
+ return 0;
+}
+
+int erofs_try_to_free_cached_page(struct address_space *mapping,
+ struct page *page)
+{
+ struct z_erofs_pcluster *const pcl = (void *)page_private(page);
+ const unsigned int clusterpages = BIT(pcl->clusterbits);
+ int ret = 0; /* 0 - busy */
+
+ if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
+ unsigned int i;
+
+ for (i = 0; i < clusterpages; ++i) {
+ if (pcl->compressed_pages[i] == page) {
+ WRITE_ONCE(pcl->compressed_pages[i], NULL);
+ ret = 1;
+ break;
+ }
+ }
+ erofs_workgroup_unfreeze(&pcl->obj, 1);
+
+ if (ret) {
+ ClearPagePrivate(page);
+ put_page(page);
+ }
+ }
+ return ret;
+}
+
+/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
+static inline bool try_inplace_io(struct z_erofs_collector *clt,
+ struct page *page)
+{
+ struct z_erofs_pcluster *const pcl = clt->pcl;
+ const unsigned int clusterpages = BIT(pcl->clusterbits);
+
+ while (clt->compressedpages < pcl->compressed_pages + clusterpages) {
+ if (!cmpxchg(clt->compressedpages++, NULL, page))
+ return true;
+ }
+ return false;
+}
+
+/* callers must hold the collection lock */
+static int z_erofs_attach_page(struct z_erofs_collector *clt,
+ struct page *page,
+ enum z_erofs_page_type type)
+{
+ int ret;
+ bool occupied;
+
+ /* give priority to in-place I/O */
+ if (clt->mode >= COLLECT_PRIMARY &&
+ type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+ try_inplace_io(clt, page))
+ return 0;
+
+ ret = z_erofs_pagevec_enqueue(&clt->vector,
+ page, type, &occupied);
+ clt->cl->vcnt += (unsigned int)ret;
+
+ return ret ? 0 : -EAGAIN;
+}
+
+static enum z_erofs_collectmode
+try_to_claim_pcluster(struct z_erofs_pcluster *pcl,
+ z_erofs_next_pcluster_t *owned_head)
+{
+ /* let's claim the following types of pclusters */
+retry:
+ if (pcl->next == Z_EROFS_PCLUSTER_NIL) {
+ /* type 1, nil pcluster */
+ if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
+ *owned_head) != Z_EROFS_PCLUSTER_NIL)
+ goto retry;
+
+ *owned_head = &pcl->next;
+ /* lucky, I am the followee :) */
+ return COLLECT_PRIMARY_FOLLOWED;
+ } else if (pcl->next == Z_EROFS_PCLUSTER_TAIL) {
+ /*
+ * type 2, link to the end of an existing open chain,
+ * be careful that its submission itself is governed
+ * by the original owned chain. 
+ */ + if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + *owned_head) != Z_EROFS_PCLUSTER_TAIL) + goto retry; + *owned_head = Z_EROFS_PCLUSTER_TAIL; + return COLLECT_PRIMARY_HOOKED; + } + return COLLECT_PRIMARY; /* :( better luck next time */ +} + +static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt, + struct inode *inode, + struct erofs_map_blocks *map) +{ + struct erofs_workgroup *grp; + struct z_erofs_pcluster *pcl; + struct z_erofs_collection *cl; + unsigned int length; + bool tag; + + grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT, &tag); + if (!grp) + return NULL; + + pcl = container_of(grp, struct z_erofs_pcluster, obj); + if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) { + DBG_BUGON(1); + erofs_workgroup_put(grp); + return ERR_PTR(-EFSCORRUPTED); + } + + cl = z_erofs_primarycollection(pcl); + if (unlikely(cl->pageofs != (map->m_la & ~PAGE_MASK))) { + DBG_BUGON(1); + erofs_workgroup_put(grp); + return ERR_PTR(-EFSCORRUPTED); + } + + length = READ_ONCE(pcl->length); + if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) { + if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) { + DBG_BUGON(1); + erofs_workgroup_put(grp); + return ERR_PTR(-EFSCORRUPTED); + } + } else { + unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT; + + if (map->m_flags & EROFS_MAP_FULL_MAPPED) + llen |= Z_EROFS_PCLUSTER_FULL_LENGTH; + + while (llen > length && + length != cmpxchg_relaxed(&pcl->length, length, llen)) { + cpu_relax(); + length = READ_ONCE(pcl->length); + } + } + mutex_lock(&cl->lock); + /* used to check tail merging loop due to corrupted images */ + if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) + clt->tailpcl = pcl; + clt->mode = try_to_claim_pcluster(pcl, &clt->owned_head); + /* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */ + if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) + clt->tailpcl = NULL; + clt->pcl = pcl; + clt->cl = cl; + return cl; +} + +static struct z_erofs_collection *clregister(struct z_erofs_collector *clt, + struct inode *inode, + struct erofs_map_blocks *map) +{ + struct z_erofs_pcluster *pcl; + struct z_erofs_collection *cl; + int err; + + /* no available workgroup, let's allocate one */ + pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS); + if (unlikely(!pcl)) + return ERR_PTR(-ENOMEM); + + init_always(pcl); + pcl->obj.index = map->m_pa >> PAGE_SHIFT; + + pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | + (map->m_flags & EROFS_MAP_FULL_MAPPED ? + Z_EROFS_PCLUSTER_FULL_LENGTH : 0); + + if (map->m_flags & EROFS_MAP_ZIPPED) + pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4; + else + pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; + + pcl->clusterbits = EROFS_V(inode)->z_physical_clusterbits[0]; + pcl->clusterbits -= PAGE_SHIFT; + + /* new pclusters should be claimed as type 1, primary and followed */ + pcl->next = clt->owned_head; + clt->mode = COLLECT_PRIMARY_FOLLOWED; + + cl = z_erofs_primarycollection(pcl); + cl->pageofs = map->m_la & ~PAGE_MASK; + + /* + * lock all primary followed works before visible to others + * and mutex_trylock *never* fails for a new pcluster. 
+ */ + mutex_trylock(&cl->lock); + + err = erofs_register_workgroup(inode->i_sb, &pcl->obj, 0); + if (err) { + mutex_unlock(&cl->lock); + kmem_cache_free(pcluster_cachep, pcl); + return ERR_PTR(-EAGAIN); + } + /* used to check tail merging loop due to corrupted images */ + if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) + clt->tailpcl = pcl; + clt->owned_head = &pcl->next; + clt->pcl = pcl; + clt->cl = cl; + return cl; +} + +static int z_erofs_collector_begin(struct z_erofs_collector *clt, + struct inode *inode, + struct erofs_map_blocks *map) +{ + struct z_erofs_collection *cl; + + DBG_BUGON(clt->cl); + + /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */ + DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL); + DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + + if (!PAGE_ALIGNED(map->m_pa)) { + DBG_BUGON(1); + return -EINVAL; + } + +repeat: + cl = cllookup(clt, inode, map); + if (!cl) { + cl = clregister(clt, inode, map); + + if (unlikely(cl == ERR_PTR(-EAGAIN))) + goto repeat; + } + + if (IS_ERR(cl)) + return PTR_ERR(cl); + + z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS, + cl->pagevec, cl->vcnt); + + clt->compressedpages = clt->pcl->compressed_pages; + if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */ + clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES; + return 0; +} + +/* + * keep in mind that no referenced pclusters will be freed + * only after a RCU grace period. + */ +static void z_erofs_rcu_callback(struct rcu_head *head) +{ + struct z_erofs_collection *const cl = + container_of(head, struct z_erofs_collection, rcu); + + kmem_cache_free(pcluster_cachep, + container_of(cl, struct z_erofs_pcluster, + primary_collection)); +} + +void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) +{ + struct z_erofs_pcluster *const pcl = + container_of(grp, struct z_erofs_pcluster, obj); + struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl); + + call_rcu(&cl->rcu, z_erofs_rcu_callback); +} + +static void z_erofs_collection_put(struct z_erofs_collection *cl) +{ + struct z_erofs_pcluster *const pcl = + container_of(cl, struct z_erofs_pcluster, primary_collection); + + erofs_workgroup_put(&pcl->obj); +} + +static bool z_erofs_collector_end(struct z_erofs_collector *clt) +{ + struct z_erofs_collection *cl = clt->cl; + + if (!cl) + return false; + + z_erofs_pagevec_ctor_exit(&clt->vector, false); + mutex_unlock(&cl->lock); + + /* + * if all pending pages are added, don't hold its reference + * any longer if the pcluster isn't hosted by ourselves. 
+ */ + if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) + z_erofs_collection_put(cl); + + clt->cl = NULL; + return true; +} + +static inline struct page *__stagingpage_alloc(struct list_head *pagepool, + gfp_t gfp) +{ + struct page *page = erofs_allocpage(pagepool, gfp, true); + + page->mapping = Z_EROFS_MAPPING_STAGING; + return page; +} + +static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, + unsigned int cachestrategy, + erofs_off_t la) +{ + if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) + return false; + + if (fe->backmost) + return true; + + return cachestrategy >= EROFS_ZIP_CACHE_READAROUND && + la < fe->headoffset; +} + +static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, + struct page *page, + struct list_head *pagepool) +{ + struct inode *const inode = fe->inode; + struct erofs_sb_info *const sbi __maybe_unused = EROFS_I_SB(inode); + struct erofs_map_blocks *const map = &fe->map; + struct z_erofs_collector *const clt = &fe->clt; + const loff_t offset = page_offset(page); + bool tight = (clt->mode >= COLLECT_PRIMARY_HOOKED); + + enum z_erofs_cache_alloctype cache_strategy; + enum z_erofs_page_type page_type; + unsigned int cur, end, spiltted, index; + int err = 0; + + /* register locked file pages as online pages in pack */ + z_erofs_onlinepage_init(page); + + spiltted = 0; + end = PAGE_SIZE; +repeat: + cur = end - 1; + + /* lucky, within the range of the current map_blocks */ + if (offset + cur >= map->m_la && + offset + cur < map->m_la + map->m_llen) { + /* didn't get a valid collection previously (very rare) */ + if (!clt->cl) + goto restart_now; + goto hitted; + } + + /* go ahead the next map_blocks */ + debugln("%s: [out-of-range] pos %llu", __func__, offset + cur); + + if (z_erofs_collector_end(clt)) + fe->backmost = false; + + map->m_la = offset + cur; + map->m_llen = 0; + err = z_erofs_map_blocks_iter(inode, map, 0); + if (unlikely(err)) + goto err_out; + +restart_now: + if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED))) + goto hitted; + + err = z_erofs_collector_begin(clt, inode, map); + if (unlikely(err)) + goto err_out; + + /* preload all compressed pages (maybe downgrade role if necessary) */ + if (should_alloc_managed_pages(fe, sbi->cache_strategy, map->m_la)) + cache_strategy = DELAYEDALLOC; + else + cache_strategy = DONTALLOC; + + preload_compressed_pages(clt, MNGD_MAPPING(sbi), + cache_strategy, pagepool); + + tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED); +hitted: + cur = end - min_t(unsigned int, offset + end - map->m_la, end); + if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED))) { + zero_user_segment(page, cur, end); + goto next_part; + } + + /* let's derive page type */ + page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD : + (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : + (tight ? 
Z_EROFS_PAGE_TYPE_EXCLUSIVE : + Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED)); + + if (cur) + tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED); + +retry: + err = z_erofs_attach_page(clt, page, page_type); + /* should allocate an additional staging page for pagevec */ + if (err == -EAGAIN) { + struct page *const newpage = + __stagingpage_alloc(pagepool, GFP_NOFS); + + err = z_erofs_attach_page(clt, newpage, + Z_EROFS_PAGE_TYPE_EXCLUSIVE); + if (likely(!err)) + goto retry; + } + + if (unlikely(err)) + goto err_out; + + index = page->index - (map->m_la >> PAGE_SHIFT); + + z_erofs_onlinepage_fixup(page, index, true); + + /* bump up the number of spiltted parts of a page */ + ++spiltted; + /* also update nr_pages */ + clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1); +next_part: + /* can be used for verification */ + map->m_llen = offset + cur - map->m_la; + + end = cur; + if (end > 0) + goto repeat; + +out: + z_erofs_onlinepage_endio(page); + + debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu", + __func__, page, spiltted, map->m_llen); + return err; + + /* if some error occurred while processing this page */ +err_out: + SetPageError(page); + goto out; +} + +static void z_erofs_vle_unzip_kickoff(void *ptr, int bios) +{ + tagptr1_t t = tagptr_init(tagptr1_t, ptr); + struct z_erofs_unzip_io *io = tagptr_unfold_ptr(t); + bool background = tagptr_unfold_tags(t); + + if (!background) { + unsigned long flags; + + spin_lock_irqsave(&io->u.wait.lock, flags); + if (!atomic_add_return(bios, &io->pending_bios)) + wake_up_locked(&io->u.wait); + spin_unlock_irqrestore(&io->u.wait.lock, flags); + return; + } + + if (!atomic_add_return(bios, &io->pending_bios)) + queue_work(z_erofs_workqueue, &io->u.work); +} + +static inline void z_erofs_vle_read_endio(struct bio *bio) +{ + struct erofs_sb_info *sbi = NULL; + blk_status_t err = bio->bi_status; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + bool cachemngd = false; + + DBG_BUGON(PageUptodate(page)); + DBG_BUGON(!page->mapping); + + if (unlikely(!sbi && !z_erofs_page_is_staging(page))) { + sbi = EROFS_SB(page->mapping->host->i_sb); + + if (time_to_inject(sbi, FAULT_READ_IO)) { + erofs_show_injection_info(FAULT_READ_IO); + err = BLK_STS_IOERR; + } + } + + /* sbi should already be gotten if the page is managed */ + if (sbi) + cachemngd = erofs_page_is_managed(sbi, page); + + if (unlikely(err)) + SetPageError(page); + else if (cachemngd) + SetPageUptodate(page); + + if (cachemngd) + unlock_page(page); + } + + z_erofs_vle_unzip_kickoff(bio->bi_private, -1); + bio_put(bio); +} + +static int z_erofs_decompress_pcluster(struct super_block *sb, + struct z_erofs_pcluster *pcl, + struct list_head *pagepool) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + const unsigned int clusterpages = BIT(pcl->clusterbits); + struct z_erofs_pagevec_ctor ctor; + unsigned int i, outputsize, llen, nr_pages; + struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; + struct page **pages, **compressed_pages, *page; + + enum z_erofs_page_type page_type; + bool overlapped, partial; + struct z_erofs_collection *cl; + int err; + + might_sleep(); + cl = z_erofs_primarycollection(pcl); + DBG_BUGON(!READ_ONCE(cl->nr_pages)); + + mutex_lock(&cl->lock); + nr_pages = cl->nr_pages; + + if (likely(nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES)) { + pages = pages_onstack; + } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES && + mutex_trylock(&z_pagemap_global_lock)) { + pages = 
z_pagemap_global; + } else { + gfp_t gfp_flags = GFP_KERNEL; + + if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES) + gfp_flags |= __GFP_NOFAIL; + + pages = kvmalloc_array(nr_pages, sizeof(struct page *), + gfp_flags); + + /* fallback to global pagemap for the lowmem scenario */ + if (unlikely(!pages)) { + mutex_lock(&z_pagemap_global_lock); + pages = z_pagemap_global; + } + } + + for (i = 0; i < nr_pages; ++i) + pages[i] = NULL; + + err = 0; + z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, + cl->pagevec, 0); + + for (i = 0; i < cl->vcnt; ++i) { + unsigned int pagenr; + + page = z_erofs_pagevec_dequeue(&ctor, &page_type); + + /* all pages in pagevec ought to be valid */ + DBG_BUGON(!page); + DBG_BUGON(!page->mapping); + + if (z_erofs_put_stagingpage(pagepool, page)) + continue; + + if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) + pagenr = 0; + else + pagenr = z_erofs_onlinepage_index(page); + + DBG_BUGON(pagenr >= nr_pages); + + /* + * currently EROFS doesn't support multiref(dedup), + * so here erroring out one multiref page. + */ + if (unlikely(pages[pagenr])) { + DBG_BUGON(1); + SetPageError(pages[pagenr]); + z_erofs_onlinepage_endio(pages[pagenr]); + err = -EFSCORRUPTED; + } + pages[pagenr] = page; + } + z_erofs_pagevec_ctor_exit(&ctor, true); + + overlapped = false; + compressed_pages = pcl->compressed_pages; + + for (i = 0; i < clusterpages; ++i) { + unsigned int pagenr; + + page = compressed_pages[i]; + + /* all compressed pages ought to be valid */ + DBG_BUGON(!page); + DBG_BUGON(!page->mapping); + + if (!z_erofs_page_is_staging(page)) { + if (erofs_page_is_managed(sbi, page)) { + if (unlikely(!PageUptodate(page))) + err = -EIO; + continue; + } + + /* + * only if non-head page can be selected + * for inplace decompression + */ + pagenr = z_erofs_onlinepage_index(page); + + DBG_BUGON(pagenr >= nr_pages); + if (unlikely(pages[pagenr])) { + DBG_BUGON(1); + SetPageError(pages[pagenr]); + z_erofs_onlinepage_endio(pages[pagenr]); + err = -EFSCORRUPTED; + } + pages[pagenr] = page; + + overlapped = true; + } + + /* PG_error needs checking for inplaced and staging pages */ + if (unlikely(PageError(page))) { + DBG_BUGON(PageUptodate(page)); + err = -EIO; + } + } + + if (unlikely(err)) + goto out; + + llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT; + if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) { + outputsize = llen; + partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH); + } else { + outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs; + partial = true; + } + + err = z_erofs_decompress(&(struct z_erofs_decompress_req) { + .sb = sb, + .in = compressed_pages, + .out = pages, + .pageofs_out = cl->pageofs, + .inputsize = PAGE_SIZE, + .outputsize = outputsize, + .alg = pcl->algorithmformat, + .inplace_io = overlapped, + .partial_decoding = partial + }, pagepool); + +out: + /* must handle all compressed pages before endding pages */ + for (i = 0; i < clusterpages; ++i) { + page = compressed_pages[i]; + + if (erofs_page_is_managed(sbi, page)) + continue; + + /* recycle all individual staging pages */ + (void)z_erofs_put_stagingpage(pagepool, page); + + WRITE_ONCE(compressed_pages[i], NULL); + } + + for (i = 0; i < nr_pages; ++i) { + page = pages[i]; + if (!page) + continue; + + DBG_BUGON(!page->mapping); + + /* recycle all individual staging pages */ + if (z_erofs_put_stagingpage(pagepool, page)) + continue; + + if (unlikely(err < 0)) + SetPageError(page); + + z_erofs_onlinepage_endio(page); + } + + if (pages == z_pagemap_global) + mutex_unlock(&z_pagemap_global_lock); + else if 
(unlikely(pages != pages_onstack)) + kvfree(pages); + + cl->nr_pages = 0; + cl->vcnt = 0; + + /* all cl locks MUST be taken before the following line */ + WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL); + + /* all cl locks SHOULD be released right now */ + mutex_unlock(&cl->lock); + + z_erofs_collection_put(cl); + return err; +} + +static void z_erofs_vle_unzip_all(struct super_block *sb, + struct z_erofs_unzip_io *io, + struct list_head *pagepool) +{ + z_erofs_next_pcluster_t owned = io->head; + + while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) { + struct z_erofs_pcluster *pcl; + + /* no possible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ + DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL); + + /* no possible that 'owned' equals NULL */ + DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); + + pcl = container_of(owned, struct z_erofs_pcluster, next); + owned = READ_ONCE(pcl->next); + + z_erofs_decompress_pcluster(sb, pcl, pagepool); + } +} + +static void z_erofs_vle_unzip_wq(struct work_struct *work) +{ + struct z_erofs_unzip_io_sb *iosb = + container_of(work, struct z_erofs_unzip_io_sb, io.u.work); + LIST_HEAD(pagepool); + + DBG_BUGON(iosb->io.head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &pagepool); + + put_pages_list(&pagepool); + kvfree(iosb); +} + +static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, + unsigned int nr, + struct list_head *pagepool, + struct address_space *mc, + gfp_t gfp) +{ + /* determined at compile time to avoid too many #ifdefs */ + const bool nocache = __builtin_constant_p(mc) ? !mc : false; + const pgoff_t index = pcl->obj.index; + bool tocache = false; + + struct address_space *mapping; + struct page *oldpage, *page; + + compressed_page_t t; + int justfound; + +repeat: + page = READ_ONCE(pcl->compressed_pages[nr]); + oldpage = page; + + if (!page) + goto out_allocpage; + + /* + * the cached page has not been allocated and + * an placeholder is out there, prepare it now. + */ + if (!nocache && page == PAGE_UNALLOCATED) { + tocache = true; + goto out_allocpage; + } + + /* process the target tagged pointer */ + t = tagptr_init(compressed_page_t, page); + justfound = tagptr_unfold_tags(t); + page = tagptr_unfold_ptr(t); + + mapping = READ_ONCE(page->mapping); + + /* + * if managed cache is disabled, it's no way to + * get such a cached-like page. + */ + if (nocache) { + /* if managed cache is disabled, it is impossible `justfound' */ + DBG_BUGON(justfound); + + /* and it should be locked, not uptodate, and not truncated */ + DBG_BUGON(!PageLocked(page)); + DBG_BUGON(PageUptodate(page)); + DBG_BUGON(!mapping); + goto out; + } + + /* + * unmanaged (file) pages are all locked solidly, + * therefore it is impossible for `mapping' to be NULL. + */ + if (mapping && mapping != mc) + /* ought to be unmanaged pages */ + goto out; + + lock_page(page); + + /* only true if page reclaim goes wrong, should never happen */ + DBG_BUGON(justfound && PagePrivate(page)); + + /* the page is still in manage cache */ + if (page->mapping == mc) { + WRITE_ONCE(pcl->compressed_pages[nr], page); + + ClearPageError(page); + if (!PagePrivate(page)) { + /* + * impossible to be !PagePrivate(page) for + * the current restriction as well if + * the page is already in compressed_pages[]. 
+ */ + DBG_BUGON(!justfound); + + justfound = 0; + set_page_private(page, (unsigned long)pcl); + SetPagePrivate(page); + } + + /* no need to submit io if it is already up-to-date */ + if (PageUptodate(page)) { + unlock_page(page); + page = NULL; + } + goto out; + } + + /* + * the managed page has been truncated, it's unsafe to + * reuse this one, let's allocate a new cache-managed page. + */ + DBG_BUGON(page->mapping); + DBG_BUGON(!justfound); + + tocache = true; + unlock_page(page); + put_page(page); +out_allocpage: + page = __stagingpage_alloc(pagepool, gfp); + if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { + list_add(&page->lru, pagepool); + cpu_relax(); + goto repeat; + } + if (nocache || !tocache) + goto out; + if (add_to_page_cache_lru(page, mc, index + nr, gfp)) { + page->mapping = Z_EROFS_MAPPING_STAGING; + goto out; + } + + set_page_private(page, (unsigned long)pcl); + SetPagePrivate(page); +out: /* the only exit (for tracing and debugging) */ + return page; +} + +static struct z_erofs_unzip_io *jobqueue_init(struct super_block *sb, + struct z_erofs_unzip_io *io, + bool foreground) +{ + struct z_erofs_unzip_io_sb *iosb; + + if (foreground) { + /* waitqueue available for foreground io */ + DBG_BUGON(!io); + + init_waitqueue_head(&io->u.wait); + atomic_set(&io->pending_bios, 0); + goto out; + } + + iosb = kvzalloc(sizeof(*iosb), GFP_KERNEL | __GFP_NOFAIL); + DBG_BUGON(!iosb); + + /* initialize fields in the allocated descriptor */ + io = &iosb->io; + iosb->sb = sb; + INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq); +out: + io->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; + return io; +} + +/* define decompression jobqueue types */ +enum { + JQ_BYPASS, + JQ_SUBMIT, + NR_JOBQUEUES, +}; + +static void *jobqueueset_init(struct super_block *sb, + z_erofs_next_pcluster_t qtail[], + struct z_erofs_unzip_io *q[], + struct z_erofs_unzip_io *fgq, + bool forcefg) +{ + /* + * if managed cache is enabled, bypass jobqueue is needed, + * no need to read from device for all pclusters in this queue. + */ + q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, true); + qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; + + q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, forcefg); + qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; + + return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], !forcefg)); +} + +static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, + z_erofs_next_pcluster_t qtail[], + z_erofs_next_pcluster_t owned_head) +{ + z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; + z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; + + DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + if (owned_head == Z_EROFS_PCLUSTER_TAIL) + owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED; + + WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED); + + WRITE_ONCE(*submit_qtail, owned_head); + WRITE_ONCE(*bypass_qtail, &pcl->next); + + qtail[JQ_BYPASS] = &pcl->next; +} + +static bool postsubmit_is_all_bypassed(struct z_erofs_unzip_io *q[], + unsigned int nr_bios, + bool force_fg) +{ + /* + * although background is preferred, no one is pending for submission. + * don't issue workqueue for decompression but drop it directly instead. 
+ */ + if (force_fg || nr_bios) + return false; + + kvfree(container_of(q[JQ_SUBMIT], struct z_erofs_unzip_io_sb, io)); + return true; +} + +static bool z_erofs_vle_submit_all(struct super_block *sb, + z_erofs_next_pcluster_t owned_head, + struct list_head *pagepool, + struct z_erofs_unzip_io *fgq, + bool force_fg) +{ + struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb); + z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; + struct z_erofs_unzip_io *q[NR_JOBQUEUES]; + struct bio *bio; + void *bi_private; + /* since bio will be NULL, no need to initialize last_index */ + pgoff_t uninitialized_var(last_index); + bool force_submit = false; + unsigned int nr_bios; + + if (unlikely(owned_head == Z_EROFS_PCLUSTER_TAIL)) + return false; + + force_submit = false; + bio = NULL; + nr_bios = 0; + bi_private = jobqueueset_init(sb, qtail, q, fgq, force_fg); + + /* by default, all need io submission */ + q[JQ_SUBMIT]->head = owned_head; + + do { + struct z_erofs_pcluster *pcl; + unsigned int clusterpages; + pgoff_t first_index; + struct page *page; + unsigned int i = 0, bypass = 0; + int err; + + /* no possible 'owned_head' equals the following */ + DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL); + + pcl = container_of(owned_head, struct z_erofs_pcluster, next); + + clusterpages = BIT(pcl->clusterbits); + + /* close the main owned chain at first */ + owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + Z_EROFS_PCLUSTER_TAIL_CLOSED); + + first_index = pcl->obj.index; + force_submit |= (first_index != last_index + 1); + +repeat: + page = pickup_page_for_submission(pcl, i, pagepool, + MNGD_MAPPING(sbi), + GFP_NOFS); + if (!page) { + force_submit = true; + ++bypass; + goto skippage; + } + + if (bio && force_submit) { +submit_bio_retry: + __submit_bio(bio, REQ_OP_READ, 0); + bio = NULL; + } + + if (!bio) { + bio = erofs_grab_bio(sb, first_index + i, + BIO_MAX_PAGES, bi_private, + z_erofs_vle_read_endio, true); + ++nr_bios; + } + + err = bio_add_page(bio, page, PAGE_SIZE, 0); + if (err < PAGE_SIZE) + goto submit_bio_retry; + + force_submit = false; + last_index = first_index + i; +skippage: + if (++i < clusterpages) + goto repeat; + + if (bypass < clusterpages) + qtail[JQ_SUBMIT] = &pcl->next; + else + move_to_bypass_jobqueue(pcl, qtail, owned_head); + } while (owned_head != Z_EROFS_PCLUSTER_TAIL); + + if (bio) + __submit_bio(bio, REQ_OP_READ, 0); + + if (postsubmit_is_all_bypassed(q, nr_bios, force_fg)) + return true; + + z_erofs_vle_unzip_kickoff(bi_private, nr_bios); + return true; +} + +static void z_erofs_submit_and_unzip(struct super_block *sb, + struct z_erofs_collector *clt, + struct list_head *pagepool, + bool force_fg) +{ + struct z_erofs_unzip_io io[NR_JOBQUEUES]; + + if (!z_erofs_vle_submit_all(sb, clt->owned_head, + pagepool, io, force_fg)) + return; + + /* decompress no I/O pclusters immediately */ + z_erofs_vle_unzip_all(sb, &io[JQ_BYPASS], pagepool); + + if (!force_fg) + return; + + /* wait until all bios are completed */ + wait_event(io[JQ_SUBMIT].u.wait, + !atomic_read(&io[JQ_SUBMIT].pending_bios)); + + /* let's synchronous decompression */ + z_erofs_vle_unzip_all(sb, &io[JQ_SUBMIT], pagepool); +} + +static int z_erofs_vle_normalaccess_readpage(struct file *file, + struct page *page) +{ + struct inode *const inode = page->mapping->host; + struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + int err; + LIST_HEAD(pagepool); + + trace_erofs_readpage(page, false); + + f.headoffset = (erofs_off_t)page->index << 
PAGE_SHIFT; + + err = z_erofs_do_read_page(&f, page, &pagepool); + (void)z_erofs_collector_end(&f.clt); + + /* if some compressed cluster ready, need submit them anyway */ + z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, true); + + if (err) + errln("%s, failed to read, err [%d]", __func__, err); + + if (f.map.mpage) + put_page(f.map.mpage); + + /* clean up the remaining free pages */ + put_pages_list(&pagepool); + return err; +} + +static bool should_decompress_synchronously(struct erofs_sb_info *sbi, + unsigned int nr) +{ + return nr <= sbi->max_sync_decompress_pages; +} + +static int z_erofs_vle_normalaccess_readpages(struct file *filp, + struct address_space *mapping, + struct list_head *pages, + unsigned int nr_pages) +{ + struct inode *const inode = mapping->host; + struct erofs_sb_info *const sbi = EROFS_I_SB(inode); + + bool sync = should_decompress_synchronously(sbi, nr_pages); + struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); + struct page *head = NULL; + LIST_HEAD(pagepool); + + trace_erofs_readpages(mapping->host, lru_to_page(pages), + nr_pages, false); + + f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT; + + for (; nr_pages; --nr_pages) { + struct page *page = lru_to_page(pages); + + prefetchw(&page->flags); + list_del(&page->lru); + + /* + * A pure asynchronous readahead is indicated if + * a PG_readahead marked page is hitted at first. + * Let's also do asynchronous decompression for this case. + */ + sync &= !(PageReadahead(page) && !head); + + if (add_to_page_cache_lru(page, mapping, page->index, gfp)) { + list_add(&page->lru, &pagepool); + continue; + } + + set_page_private(page, (unsigned long)head); + head = page; + } + + while (head) { + struct page *page = head; + int err; + + /* traversal in reverse order */ + head = (void *)page_private(page); + + err = z_erofs_do_read_page(&f, page, &pagepool); + if (err) { + struct erofs_vnode *vi = EROFS_V(inode); + + errln("%s, readahead error at page %lu of nid %llu", + __func__, page->index, vi->nid); + } + put_page(page); + } + + (void)z_erofs_collector_end(&f.clt); + + z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, sync); + + if (f.map.mpage) + put_page(f.map.mpage); + + /* clean up the remaining free pages */ + put_pages_list(&pagepool); + return 0; +} + +const struct address_space_operations z_erofs_vle_normalaccess_aops = { + .readpage = z_erofs_vle_normalaccess_readpage, + .readpages = z_erofs_vle_normalaccess_readpages, +}; + diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h new file mode 100644 index 000000000000..4fc547bc01f9 --- /dev/null +++ b/fs/erofs/zdata.h @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_FS_ZDATA_H +#define __EROFS_FS_ZDATA_H + +#include "internal.h" +#include "zpvec.h" + +#define Z_EROFS_NR_INLINE_PAGEVECS 3 + +/* + * Structure fields follow one of the following exclusion rules. + * + * I: Modifiable by initialization/destruction paths and read-only + * for everyone else; + * + * L: Field should be protected by pageset lock; + * + * A: Field should be accessed / updated in atomic for parallelized code. 
+ */ +struct z_erofs_collection { + struct mutex lock; + + /* I: page offset of start position of decompression */ + unsigned short pageofs; + + /* L: maximum relative page index in pagevec[] */ + unsigned short nr_pages; + + /* L: total number of pages in pagevec[] */ + unsigned int vcnt; + + union { + /* L: inline a certain number of pagevecs for bootstrap */ + erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS]; + + /* I: can be used to free the pcluster by RCU. */ + struct rcu_head rcu; + }; +}; + +#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 +#define Z_EROFS_PCLUSTER_LENGTH_BIT 1 + +/* + * let's leave a type here in case of introducing + * another tagged pointer later. + */ +typedef void *z_erofs_next_pcluster_t; + +struct z_erofs_pcluster { + struct erofs_workgroup obj; + struct z_erofs_collection primary_collection; + + /* A: point to next chained pcluster or TAILs */ + z_erofs_next_pcluster_t next; + + /* A: compressed pages (including multi-usage pages) */ + struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES]; + + /* A: lower limit of decompressed length and if full length or not */ + unsigned int length; + + /* I: compression algorithm format */ + unsigned char algorithmformat; + /* I: bit shift of physical cluster size */ + unsigned char clusterbits; +}; + +#define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection) + +/* let's avoid the valid 32-bit kernel addresses */ + +/* the chained workgroup has't submitted io (still open) */ +#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE) +/* the chained workgroup has already submitted io */ +#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD) + +#define Z_EROFS_PCLUSTER_NIL (NULL) + +#define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_pcluster) + +struct z_erofs_unzip_io { + atomic_t pending_bios; + z_erofs_next_pcluster_t head; + + union { + wait_queue_head_t wait; + struct work_struct work; + } u; +}; + +struct z_erofs_unzip_io_sb { + struct z_erofs_unzip_io io; + struct super_block *sb; +}; + +#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) +static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, + struct page *page) +{ + return page->mapping == MNGD_MAPPING(sbi); +} + +#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 +#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) +#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) + +/* + * waiters (aka. 
ongoing_packs): # to unlock the page + * sub-index: 0 - for partial page, >= 1 full page sub-index + */ +typedef atomic_t z_erofs_onlinepage_t; + +/* type punning */ +union z_erofs_onlinepage_converter { + z_erofs_onlinepage_t *o; + unsigned long *v; +}; + +static inline unsigned int z_erofs_onlinepage_index(struct page *page) +{ + union z_erofs_onlinepage_converter u; + + DBG_BUGON(!PagePrivate(page)); + u.v = &page_private(page); + + return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; +} + +static inline void z_erofs_onlinepage_init(struct page *page) +{ + union { + z_erofs_onlinepage_t o; + unsigned long v; + /* keep from being unlocked in advance */ + } u = { .o = ATOMIC_INIT(1) }; + + set_page_private(page, u.v); + smp_wmb(); + SetPagePrivate(page); +} + +static inline void z_erofs_onlinepage_fixup(struct page *page, + uintptr_t index, bool down) +{ + unsigned long *p, o, v, id; +repeat: + p = &page_private(page); + o = READ_ONCE(*p); + + id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; + if (id) { + if (!index) + return; + + DBG_BUGON(id != index); + } + + v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) | + ((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down); + if (cmpxchg(p, o, v) != o) + goto repeat; +} + +static inline void z_erofs_onlinepage_endio(struct page *page) +{ + union z_erofs_onlinepage_converter u; + unsigned int v; + + DBG_BUGON(!PagePrivate(page)); + u.v = &page_private(page); + + v = atomic_dec_return(u.o); + if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) { + ClearPagePrivate(page); + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + } + debugln("%s, page %p value %x", __func__, page, atomic_read(u.o)); +} + +#define Z_EROFS_VMAP_ONSTACK_PAGES \ + min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U) +#define Z_EROFS_VMAP_GLOBAL_PAGES 2048 + +#endif + diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c new file mode 100644 index 000000000000..4dc9cec01297 --- /dev/null +++ b/fs/erofs/zmap.c @@ -0,0 +1,466 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2018-2019 HUAWEI, Inc. 
+ * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#include "internal.h" +#include <asm/unaligned.h> +#include <trace/events/erofs.h> + +int z_erofs_fill_inode(struct inode *inode) +{ + struct erofs_vnode *const vi = EROFS_V(inode); + + if (vi->datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { + vi->z_advise = 0; + vi->z_algorithmtype[0] = 0; + vi->z_algorithmtype[1] = 0; + vi->z_logical_clusterbits = LOG_BLOCK_SIZE; + vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits; + vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits; + set_bit(EROFS_V_Z_INITED_BIT, &vi->flags); + } + + inode->i_mapping->a_ops = &z_erofs_vle_normalaccess_aops; + return 0; +} + +static int fill_inode_lazy(struct inode *inode) +{ + struct erofs_vnode *const vi = EROFS_V(inode); + struct super_block *const sb = inode->i_sb; + int err; + erofs_off_t pos; + struct page *page; + void *kaddr; + struct z_erofs_map_header *h; + + if (test_bit(EROFS_V_Z_INITED_BIT, &vi->flags)) + return 0; + + if (wait_on_bit_lock(&vi->flags, EROFS_V_BL_Z_BIT, TASK_KILLABLE)) + return -ERESTARTSYS; + + err = 0; + if (test_bit(EROFS_V_Z_INITED_BIT, &vi->flags)) + goto out_unlock; + + DBG_BUGON(vi->datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY); + + pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + + vi->xattr_isize, 8); + page = erofs_get_meta_page(sb, erofs_blknr(pos), false); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out_unlock; + } + + kaddr = kmap_atomic(page); + + h = kaddr + erofs_blkoff(pos); + vi->z_advise = le16_to_cpu(h->h_advise); + vi->z_algorithmtype[0] = h->h_algorithmtype & 15; + vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; + + if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) { + errln("unknown compression format %u for nid %llu, please upgrade kernel", + vi->z_algorithmtype[0], vi->nid); + err = -EOPNOTSUPP; + goto unmap_done; + } + + vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); + vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits + + ((h->h_clusterbits >> 3) & 3); + + if (vi->z_physical_clusterbits[0] != LOG_BLOCK_SIZE) { + errln("unsupported physical clusterbits %u for nid %llu, please upgrade kernel", + vi->z_physical_clusterbits[0], vi->nid); + err = -EOPNOTSUPP; + goto unmap_done; + } + + vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits + + ((h->h_clusterbits >> 5) & 7); + set_bit(EROFS_V_Z_INITED_BIT, &vi->flags); +unmap_done: + kunmap_atomic(kaddr); + unlock_page(page); + put_page(page); +out_unlock: + clear_and_wake_up_bit(EROFS_V_BL_Z_BIT, &vi->flags); + return err; +} + +struct z_erofs_maprecorder { + struct inode *inode; + struct erofs_map_blocks *map; + void *kaddr; + + unsigned long lcn; + /* compression extent information gathered */ + u8 type; + u16 clusterofs; + u16 delta[2]; + erofs_blk_t pblk; +}; + +static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m, + erofs_blk_t eblk) +{ + struct super_block *const sb = m->inode->i_sb; + struct erofs_map_blocks *const map = m->map; + struct page *mpage = map->mpage; + + if (mpage) { + if (mpage->index == eblk) { + if (!m->kaddr) + m->kaddr = kmap_atomic(mpage); + return 0; + } + + if (m->kaddr) { + kunmap_atomic(m->kaddr); + m->kaddr = NULL; + } + put_page(mpage); + } + + mpage = erofs_get_meta_page(sb, eblk, false); + if (IS_ERR(mpage)) { + map->mpage = NULL; + return PTR_ERR(mpage); + } + m->kaddr = kmap_atomic(mpage); + unlock_page(mpage); + map->mpage = mpage; + return 0; +} + +static int vle_legacy_load_cluster_from_disk(struct 
z_erofs_maprecorder *m, + unsigned long lcn) +{ + struct inode *const inode = m->inode; + struct erofs_vnode *const vi = EROFS_V(inode); + const erofs_off_t ibase = iloc(EROFS_I_SB(inode), vi->nid); + const erofs_off_t pos = + Z_EROFS_VLE_LEGACY_INDEX_ALIGN(ibase + vi->inode_isize + + vi->xattr_isize) + + lcn * sizeof(struct z_erofs_vle_decompressed_index); + struct z_erofs_vle_decompressed_index *di; + unsigned int advise, type; + int err; + + err = z_erofs_reload_indexes(m, erofs_blknr(pos)); + if (err) + return err; + + m->lcn = lcn; + di = m->kaddr + erofs_blkoff(pos); + + advise = le16_to_cpu(di->di_advise); + type = (advise >> Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) & + ((1 << Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - 1); + switch (type) { + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + m->clusterofs = 1 << vi->z_logical_clusterbits; + m->delta[0] = le16_to_cpu(di->di_u.delta[0]); + m->delta[1] = le16_to_cpu(di->di_u.delta[1]); + break; + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + m->clusterofs = le16_to_cpu(di->di_clusterofs); + m->pblk = le32_to_cpu(di->di_u.blkaddr); + break; + default: + DBG_BUGON(1); + return -EOPNOTSUPP; + } + m->type = type; + return 0; +} + +static unsigned int decode_compactedbits(unsigned int lobits, + unsigned int lomask, + u8 *in, unsigned int pos, u8 *type) +{ + const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7); + const unsigned int lo = v & lomask; + + *type = (v >> lobits) & 3; + return lo; +} + +static int unpack_compacted_index(struct z_erofs_maprecorder *m, + unsigned int amortizedshift, + unsigned int eofs) +{ + struct erofs_vnode *const vi = EROFS_V(m->inode); + const unsigned int lclusterbits = vi->z_logical_clusterbits; + const unsigned int lomask = (1 << lclusterbits) - 1; + unsigned int vcnt, base, lo, encodebits, nblk; + int i; + u8 *in, type; + + if (1 << amortizedshift == 4) + vcnt = 2; + else if (1 << amortizedshift == 2 && lclusterbits == 12) + vcnt = 16; + else + return -EOPNOTSUPP; + + encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; + base = round_down(eofs, vcnt << amortizedshift); + in = m->kaddr + base; + + i = (eofs - base) >> amortizedshift; + + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + m->type = type; + if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { + m->clusterofs = 1 << lclusterbits; + if (i + 1 != vcnt) { + m->delta[0] = lo; + return 0; + } + /* + * since the last lcluster in the pack is special, + * of which lo saves delta[1] rather than delta[0]. + * Hence, get delta[0] by the previous lcluster indirectly. 
+ */ + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * (i - 1), &type); + if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + lo = 0; + m->delta[0] = lo + 1; + return 0; + } + m->clusterofs = lo; + m->delta[0] = 0; + /* figout out blkaddr (pblk) for HEAD lclusters */ + nblk = 1; + while (i > 0) { + --i; + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + i -= lo; + + if (i >= 0) + ++nblk; + } + in += (vcnt << amortizedshift) - sizeof(__le32); + m->pblk = le32_to_cpu(*(__le32 *)in) + nblk; + return 0; +} + +static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, + unsigned long lcn) +{ + struct inode *const inode = m->inode; + struct erofs_vnode *const vi = EROFS_V(inode); + const unsigned int lclusterbits = vi->z_logical_clusterbits; + const erofs_off_t ebase = ALIGN(iloc(EROFS_I_SB(inode), vi->nid) + + vi->inode_isize + vi->xattr_isize, 8) + + sizeof(struct z_erofs_map_header); + const unsigned int totalidx = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); + unsigned int compacted_4b_initial, compacted_2b; + unsigned int amortizedshift; + erofs_off_t pos; + int err; + + if (lclusterbits != 12) + return -EOPNOTSUPP; + + if (lcn >= totalidx) + return -EINVAL; + + m->lcn = lcn; + /* used to align to 32-byte (compacted_2b) alignment */ + compacted_4b_initial = (32 - ebase % 32) / 4; + if (compacted_4b_initial == 32 / 4) + compacted_4b_initial = 0; + + if (vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) + compacted_2b = rounddown(totalidx - compacted_4b_initial, 16); + else + compacted_2b = 0; + + pos = ebase; + if (lcn < compacted_4b_initial) { + amortizedshift = 2; + goto out; + } + pos += compacted_4b_initial * 4; + lcn -= compacted_4b_initial; + + if (lcn < compacted_2b) { + amortizedshift = 1; + goto out; + } + pos += compacted_2b * 2; + lcn -= compacted_2b; + amortizedshift = 2; +out: + pos += lcn * (1 << amortizedshift); + err = z_erofs_reload_indexes(m, erofs_blknr(pos)); + if (err) + return err; + return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos)); +} + +static int vle_load_cluster_from_disk(struct z_erofs_maprecorder *m, + unsigned int lcn) +{ + const unsigned int datamode = EROFS_V(m->inode)->datamode; + + if (datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY) + return vle_legacy_load_cluster_from_disk(m, lcn); + + if (datamode == EROFS_INODE_FLAT_COMPRESSION) + return compacted_load_cluster_from_disk(m, lcn); + + return -EINVAL; +} + +static int vle_extent_lookback(struct z_erofs_maprecorder *m, + unsigned int lookback_distance) +{ + struct erofs_vnode *const vi = EROFS_V(m->inode); + struct erofs_map_blocks *const map = m->map; + const unsigned int lclusterbits = vi->z_logical_clusterbits; + unsigned long lcn = m->lcn; + int err; + + if (lcn < lookback_distance) { + errln("bogus lookback distance @ nid %llu", vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + + /* load extent head logical cluster if needed */ + lcn -= lookback_distance; + err = vle_load_cluster_from_disk(m, lcn); + if (err) + return err; + + switch (m->type) { + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + if (unlikely(!m->delta[0])) { + errln("invalid lookback distance 0 at nid %llu", + vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + return vle_extent_lookback(m, m->delta[0]); + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + map->m_flags &= ~EROFS_MAP_ZIPPED; + /* fallthrough */ + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + map->m_la = (lcn << lclusterbits) | m->clusterofs; + break; + default: + 
errln("unknown type %u at lcn %lu of nid %llu", + m->type, lcn, vi->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } + return 0; +} + +int z_erofs_map_blocks_iter(struct inode *inode, + struct erofs_map_blocks *map, + int flags) +{ + struct erofs_vnode *const vi = EROFS_V(inode); + struct z_erofs_maprecorder m = { + .inode = inode, + .map = map, + }; + int err = 0; + unsigned int lclusterbits, endoff; + unsigned long long ofs, end; + + trace_z_erofs_map_blocks_iter_enter(inode, map, flags); + + /* when trying to read beyond EOF, leave it unmapped */ + if (unlikely(map->m_la >= inode->i_size)) { + map->m_llen = map->m_la + 1 - inode->i_size; + map->m_la = inode->i_size; + map->m_flags = 0; + goto out; + } + + err = fill_inode_lazy(inode); + if (err) + goto out; + + lclusterbits = vi->z_logical_clusterbits; + ofs = map->m_la; + m.lcn = ofs >> lclusterbits; + endoff = ofs & ((1 << lclusterbits) - 1); + + err = vle_load_cluster_from_disk(&m, m.lcn); + if (err) + goto unmap_out; + + map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */ + end = (m.lcn + 1ULL) << lclusterbits; + + switch (m.type) { + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + if (endoff >= m.clusterofs) + map->m_flags &= ~EROFS_MAP_ZIPPED; + /* fallthrough */ + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + if (endoff >= m.clusterofs) { + map->m_la = (m.lcn << lclusterbits) | m.clusterofs; + break; + } + /* m.lcn should be >= 1 if endoff < m.clusterofs */ + if (unlikely(!m.lcn)) { + errln("invalid logical cluster 0 at nid %llu", + vi->nid); + err = -EFSCORRUPTED; + goto unmap_out; + } + end = (m.lcn << lclusterbits) | m.clusterofs; + map->m_flags |= EROFS_MAP_FULL_MAPPED; + m.delta[0] = 1; + /* fallthrough */ + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + /* get the correspoinding first chunk */ + err = vle_extent_lookback(&m, m.delta[0]); + if (unlikely(err)) + goto unmap_out; + break; + default: + errln("unknown type %u at offset %llu of nid %llu", + m.type, ofs, vi->nid); + err = -EOPNOTSUPP; + goto unmap_out; + } + + map->m_llen = end - map->m_la; + map->m_plen = 1 << lclusterbits; + map->m_pa = blknr_to_addr(m.pblk); + map->m_flags |= EROFS_MAP_MAPPED; + +unmap_out: + if (m.kaddr) + kunmap_atomic(m.kaddr); + +out: + debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", + __func__, map->m_la, map->m_pa, + map->m_llen, map->m_plen, map->m_flags); + + trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err); + + /* aggressively BUG_ON iff CONFIG_EROFS_FS_DEBUG is on */ + DBG_BUGON(err < 0 && err != -ENOMEM); + return err; +} + diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h new file mode 100644 index 000000000000..bd3cee16491c --- /dev/null +++ b/fs/erofs/zpvec.h @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2018 HUAWEI, Inc. 
+ * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef __EROFS_FS_ZPVEC_H +#define __EROFS_FS_ZPVEC_H + +#include "tagptr.h" + +/* page type in pagevec for decompress subsystem */ +enum z_erofs_page_type { + /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ + Z_EROFS_PAGE_TYPE_EXCLUSIVE, + + Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, + + Z_EROFS_VLE_PAGE_TYPE_HEAD, + Z_EROFS_VLE_PAGE_TYPE_MAX +}; + +extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0") + __bad_page_type_exclusive(void); + +/* pagevec tagged pointer */ +typedef tagptr2_t erofs_vtptr_t; + +/* pagevec collector */ +struct z_erofs_pagevec_ctor { + struct page *curr, *next; + erofs_vtptr_t *pages; + + unsigned int nr, index; +}; + +static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor, + bool atomic) +{ + if (!ctor->curr) + return; + + if (atomic) + kunmap_atomic(ctor->pages); + else + kunmap(ctor->curr); +} + +static inline struct page * +z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor, + unsigned int nr) +{ + unsigned int index; + + /* keep away from occupied pages */ + if (ctor->next) + return ctor->next; + + for (index = 0; index < nr; ++index) { + const erofs_vtptr_t t = ctor->pages[index]; + const unsigned int tags = tagptr_unfold_tags(t); + + if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE) + return tagptr_unfold_ptr(t); + } + DBG_BUGON(nr >= ctor->nr); + return NULL; +} + +static inline void +z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor, + bool atomic) +{ + struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr); + + z_erofs_pagevec_ctor_exit(ctor, atomic); + + ctor->curr = next; + ctor->next = NULL; + ctor->pages = atomic ? + kmap_atomic(ctor->curr) : kmap(ctor->curr); + + ctor->nr = PAGE_SIZE / sizeof(struct page *); + ctor->index = 0; +} + +static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, + unsigned int nr, + erofs_vtptr_t *pages, + unsigned int i) +{ + ctor->nr = nr; + ctor->curr = ctor->next = NULL; + ctor->pages = pages; + + if (i >= nr) { + i -= nr; + z_erofs_pagevec_ctor_pagedown(ctor, false); + while (i > ctor->nr) { + i -= ctor->nr; + z_erofs_pagevec_ctor_pagedown(ctor, false); + } + } + ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i); + ctor->index = i; +} + +static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, + struct page *page, + enum z_erofs_page_type type, + bool *occupied) +{ + *occupied = false; + if (unlikely(!ctor->next && type)) + if (ctor->index + 1 == ctor->nr) + return false; + + if (unlikely(ctor->index >= ctor->nr)) + z_erofs_pagevec_ctor_pagedown(ctor, false); + + /* exclusive page type must be 0 */ + if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL) + __bad_page_type_exclusive(); + + /* should remind that collector->next never equal to 1, 2 */ + if (type == (uintptr_t)ctor->next) { + ctor->next = page; + *occupied = true; + } + ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type); + return true; +} + +static inline struct page * +z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor, + enum z_erofs_page_type *type) +{ + erofs_vtptr_t t; + + if (unlikely(ctor->index >= ctor->nr)) { + DBG_BUGON(!ctor->next); + z_erofs_pagevec_ctor_pagedown(ctor, true); + } + + t = ctor->pages[ctor->index]; + + *type = tagptr_unfold_tags(t); + + /* should remind that collector->next never equal to 1, 2 */ + if (*type == (uintptr_t)ctor->next) + ctor->next = tagptr_unfold_ptr(t); + + ctor->pages[ctor->index++] = 
tagptr_fold(erofs_vtptr_t, NULL, 0);
+ return tagptr_unfold_ptr(t);
+}
+#endif
+
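For reference, the low-bit pointer-tagging idea behind tagptr1_t/tagptr2_t (and therefore behind compressed_page_t and erofs_vtptr_t above) can be sketched in plain userspace C. The helpers below are illustrative stand-ins with invented names, not the in-kernel tagptr.h implementation:

/*
 * Minimal userspace sketch of low-bit pointer tagging: because the tagged
 * objects are at least 4-byte aligned, the two lowest pointer bits are always
 * zero and can carry a small tag (cf. tagptr1_t/tagptr2_t above).
 * Illustrative only -- not the in-kernel tagptr.h API.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TAG_BITS 2
#define TAG_MASK ((1UL << TAG_BITS) - 1)

/* fold a tag into the unused low bits of an aligned pointer */
static uintptr_t demo_fold(void *ptr, unsigned long tag)
{
	assert(((uintptr_t)ptr & TAG_MASK) == 0);	/* alignment guarantee */
	assert(tag <= TAG_MASK);
	return (uintptr_t)ptr | tag;
}

/* recover the original pointer by masking the tag bits off */
static void *demo_unfold_ptr(uintptr_t v)
{
	return (void *)(v & ~TAG_MASK);
}

/* recover the tag from the low bits */
static unsigned long demo_unfold_tags(uintptr_t v)
{
	return v & TAG_MASK;
}

int main(void)
{
	static int object;			/* stands in for a struct page */
	uintptr_t t = demo_fold(&object, 1);	/* e.g. a "justfound"-style tag */

	printf("ptr matches: %d, tag: %lu\n",
	       demo_unfold_ptr(t) == (void *)&object, demo_unfold_tags(t));
	return 0;
}

z_erofs_pagevec_enqueue()/dequeue() apply the same trick per pagevec slot, which is why Z_EROFS_PAGE_TYPE_EXCLUSIVE must stay 0: an untagged pointer then decodes as the exclusive type, as the __bad_page_type_exclusive() compile-time check above enforces.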