diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-10-17 01:14:43 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-10-17 01:14:43 +0300 |
commit | 7a3dadedc82e340f8292f64e7bfa964c525009c0 (patch) | |
tree | 27552fe7b39d671995cdc5e3add4847107b0ecd1 /fs | |
parent | 54a4c789ca8091ab8fcd70285caeee2c5bc62997 (diff) | |
parent | 788e96d1d39949fc91457a816f4bda0d374c257b (diff) | |
download | linux-7a3dadedc82e340f8292f64e7bfa964c525009c0.tar.xz |
Merge tag 'f2fs-for-5.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs
Pull f2fs updates from Jaegeuk Kim:
"In this round, we've added new features such as zone capacity for ZNS
and a new GC policy, ATGC, along with in-memory segment management. In
addition, we could improve the decompression speed significantly by
changing virtual mapping method. Even though we've fixed lots of small
bugs in compression support, I feel that it becomes more stable so
that I could give it a try in production.
Enhancements:
- suport zone capacity in NVMe Zoned Namespace devices
- introduce in-memory current segment management
- add standart casefolding support
- support age threshold based garbage collection
- improve decompression speed by changing virtual mapping method
Bug fixes:
- fix condition checks in some ioctl() such as compression, move_range, etc
- fix 32/64bits support in data structures
- fix memory allocation in zstd decompress
- add some boundary checks to avoid kernel panic on corrupted image
- fix disallowing compression for non-empty file
- fix slab leakage of compressed block writes
In addition, it includes code refactoring for better readability and
minor bug fixes for compression and zoned device support"
* tag 'f2fs-for-5.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs: (51 commits)
f2fs: code cleanup by removing unnecessary check
f2fs: wait for sysfs kobject removal before freeing f2fs_sb_info
f2fs: fix writecount false positive in releasing compress blocks
f2fs: introduce check_swap_activate_fast()
f2fs: don't issue flush in f2fs_flush_device_cache() for nobarrier case
f2fs: handle errors of f2fs_get_meta_page_nofail
f2fs: fix to set SBI_NEED_FSCK flag for inconsistent inode
f2fs: reject CASEFOLD inode flag without casefold feature
f2fs: fix memory alignment to support 32bit
f2fs: fix slab leak of rpages pointer
f2fs: compress: fix to disallow enabling compress on non-empty file
f2fs: compress: introduce cic/dic slab cache
f2fs: compress: introduce page array slab cache
f2fs: fix to do sanity check on segment/section count
f2fs: fix to check segment boundary during SIT page readahead
f2fs: fix uninit-value in f2fs_lookup
f2fs: remove unneeded parameter in find_in_block()
f2fs: fix wrong total_sections check and fsmeta check
f2fs: remove duplicated code in sanity_check_area_boundary
f2fs: remove unused check on version_bitmap
...
Diffstat (limited to 'fs')
-rw-r--r-- | fs/f2fs/acl.c | 6 | ||||
-rw-r--r-- | fs/f2fs/checkpoint.c | 17 | ||||
-rw-r--r-- | fs/f2fs/compress.c | 242 | ||||
-rw-r--r-- | fs/f2fs/data.c | 119 | ||||
-rw-r--r-- | fs/f2fs/debug.c | 18 | ||||
-rw-r--r-- | fs/f2fs/dir.c | 109 | ||||
-rw-r--r-- | fs/f2fs/extent_cache.c | 37 | ||||
-rw-r--r-- | fs/f2fs/f2fs.h | 118 | ||||
-rw-r--r-- | fs/f2fs/file.c | 88 | ||||
-rw-r--r-- | fs/f2fs/gc.c | 413 | ||||
-rw-r--r-- | fs/f2fs/gc.h | 69 | ||||
-rw-r--r-- | fs/f2fs/inline.c | 4 | ||||
-rw-r--r-- | fs/f2fs/inode.c | 21 | ||||
-rw-r--r-- | fs/f2fs/namei.c | 2 | ||||
-rw-r--r-- | fs/f2fs/node.c | 7 | ||||
-rw-r--r-- | fs/f2fs/segment.c | 522 | ||||
-rw-r--r-- | fs/f2fs/segment.h | 71 | ||||
-rw-r--r-- | fs/f2fs/super.c | 168 | ||||
-rw-r--r-- | fs/f2fs/sysfs.c | 22 | ||||
-rw-r--r-- | fs/f2fs/xattr.c | 8 | ||||
-rw-r--r-- | fs/libfs.c | 87 | ||||
-rw-r--r-- | fs/unicode/utf8-core.c | 23 |
22 files changed, 1701 insertions, 470 deletions
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 217b290ae3a5..306413589827 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -160,7 +160,7 @@ static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi, return (void *)f2fs_acl; fail: - kvfree(f2fs_acl); + kfree(f2fs_acl); return ERR_PTR(-EINVAL); } @@ -190,7 +190,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, acl = NULL; else acl = ERR_PTR(retval); - kvfree(value); + kfree(value); return acl; } @@ -240,7 +240,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0); - kvfree(value); + kfree(value); if (!error) set_cached_acl(inode, type, acl); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ff807e14c891..023462e80e58 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -107,7 +107,7 @@ struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) return __get_meta_page(sbi, index, true); } -struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index) +struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index) { struct page *page; int count = 0; @@ -243,6 +243,8 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, blkno * NAT_ENTRY_PER_BLOCK); break; case META_SIT: + if (unlikely(blkno >= TOTAL_SEGS(sbi))) + goto out; /* get sit block addr */ fio.new_blkaddr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); @@ -1047,8 +1049,12 @@ int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) get_pages(sbi, is_dir ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); retry: - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); return -EIO; + } spin_lock(&sbi->inode_lock[type]); @@ -1619,11 +1625,16 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_sit_entries(sbi, cpc); + /* save inmem log status */ + f2fs_save_inmem_curseg(sbi); + err = do_checkpoint(sbi, cpc); if (err) f2fs_release_discard_addrs(sbi); else f2fs_clear_prefree_segments(sbi, cpc); + + f2fs_restore_inmem_curseg(sbi); stop: unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); @@ -1654,7 +1665,7 @@ void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi) } sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS - - NR_CURSEG_TYPE - __cp_payload(sbi)) * + NR_CURSEG_PERSIST_TYPE - __cp_payload(sbi)) * F2FS_ORPHANS_PER_BLOCK; } diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 1dfb126a0cb2..14262e0f1cd6 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -17,6 +17,33 @@ #include "node.h" #include <trace/events/f2fs.h> +static struct kmem_cache *cic_entry_slab; +static struct kmem_cache *dic_entry_slab; + +static void *page_array_alloc(struct inode *inode, int nr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int size = sizeof(struct page *) * nr; + + if (likely(size <= sbi->page_array_slab_size)) + return kmem_cache_zalloc(sbi->page_array_slab, GFP_NOFS); + return f2fs_kzalloc(sbi, size, GFP_NOFS); +} + +static void page_array_free(struct inode *inode, void *pages, int nr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int size = sizeof(struct page *) * nr; + + if (!pages) + return; + + if (likely(size <= sbi->page_array_slab_size)) + kmem_cache_free(sbi->page_array_slab, pages); + else + kfree(pages); +} + struct f2fs_compress_ops { int (*init_compress_ctx)(struct compress_ctx *cc); void (*destroy_compress_ctx)(struct compress_ctx *cc); @@ -130,19 +157,16 @@ struct page *f2fs_compress_control_page(struct page *page) int f2fs_init_compress_ctx(struct compress_ctx *cc) { - struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); - - if (cc->nr_rpages) + if (cc->rpages) return 0; - cc->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << - cc->log_cluster_size, GFP_NOFS); + cc->rpages = page_array_alloc(cc->inode, cc->cluster_size); return cc->rpages ? 0 : -ENOMEM; } void f2fs_destroy_compress_ctx(struct compress_ctx *cc) { - kfree(cc->rpages); + page_array_free(cc->inode, cc->rpages, cc->cluster_size); cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; @@ -382,16 +406,17 @@ static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic) ZSTD_DStream *stream; void *workspace; unsigned int workspace_size; + unsigned int max_window_size = + MAX_COMPRESS_WINDOW_SIZE(dic->log_cluster_size); - workspace_size = ZSTD_DStreamWorkspaceBound(MAX_COMPRESS_WINDOW_SIZE); + workspace_size = ZSTD_DStreamWorkspaceBound(max_window_size); workspace = f2fs_kvmalloc(F2FS_I_SB(dic->inode), workspace_size, GFP_NOFS); if (!workspace) return -ENOMEM; - stream = ZSTD_initDStream(MAX_COMPRESS_WINDOW_SIZE, - workspace, workspace_size); + stream = ZSTD_initDStream(max_window_size, workspace, workspace_size); if (!stream) { printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_initDStream failed\n", KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, @@ -554,13 +579,29 @@ static void f2fs_compress_free_page(struct page *page) mempool_free(page, compress_page_pool); } +#define MAX_VMAP_RETRIES 3 + +static void *f2fs_vmap(struct page **pages, unsigned int count) +{ + int i; + void *buf = NULL; + + for (i = 0; i < MAX_VMAP_RETRIES; i++) { + buf = vm_map_ram(pages, count, -1); + if (buf) + break; + vm_unmap_aliases(); + } + return buf; +} + static int f2fs_compress_pages(struct compress_ctx *cc) { - struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); struct f2fs_inode_info *fi = F2FS_I(cc->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; - unsigned int max_len, nr_cpages; + unsigned int max_len, new_nr_cpages; + struct page **new_cpages; int i, ret; trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx, @@ -575,8 +616,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) max_len = COMPRESS_HEADER_SIZE + cc->clen; cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); - cc->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) * - cc->nr_cpages, GFP_NOFS); + cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages); if (!cc->cpages) { ret = -ENOMEM; goto destroy_compress_ctx; @@ -590,13 +630,13 @@ static int f2fs_compress_pages(struct compress_ctx *cc) } } - cc->rbuf = vmap(cc->rpages, cc->cluster_size, VM_MAP, PAGE_KERNEL_RO); + cc->rbuf = f2fs_vmap(cc->rpages, cc->cluster_size); if (!cc->rbuf) { ret = -ENOMEM; goto out_free_cpages; } - cc->cbuf = vmap(cc->cpages, cc->nr_cpages, VM_MAP, PAGE_KERNEL); + cc->cbuf = f2fs_vmap(cc->cpages, cc->nr_cpages); if (!cc->cbuf) { ret = -ENOMEM; goto out_vunmap_rbuf; @@ -618,16 +658,28 @@ static int f2fs_compress_pages(struct compress_ctx *cc) for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++) cc->cbuf->reserved[i] = cpu_to_le32(0); - nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); + new_nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); + + /* Now we're going to cut unnecessary tail pages */ + new_cpages = page_array_alloc(cc->inode, new_nr_cpages); + if (!new_cpages) { + ret = -ENOMEM; + goto out_vunmap_cbuf; + } /* zero out any unused part of the last page */ memset(&cc->cbuf->cdata[cc->clen], 0, - (nr_cpages * PAGE_SIZE) - (cc->clen + COMPRESS_HEADER_SIZE)); + (new_nr_cpages * PAGE_SIZE) - + (cc->clen + COMPRESS_HEADER_SIZE)); - vunmap(cc->cbuf); - vunmap(cc->rbuf); + vm_unmap_ram(cc->cbuf, cc->nr_cpages); + vm_unmap_ram(cc->rbuf, cc->cluster_size); - for (i = nr_cpages; i < cc->nr_cpages; i++) { + for (i = 0; i < cc->nr_cpages; i++) { + if (i < new_nr_cpages) { + new_cpages[i] = cc->cpages[i]; + continue; + } f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } @@ -635,22 +687,24 @@ static int f2fs_compress_pages(struct compress_ctx *cc) if (cops->destroy_compress_ctx) cops->destroy_compress_ctx(cc); - cc->nr_cpages = nr_cpages; + page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + cc->cpages = new_cpages; + cc->nr_cpages = new_nr_cpages; trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, cc->clen, ret); return 0; out_vunmap_cbuf: - vunmap(cc->cbuf); + vm_unmap_ram(cc->cbuf, cc->nr_cpages); out_vunmap_rbuf: - vunmap(cc->rbuf); + vm_unmap_ram(cc->rbuf, cc->cluster_size); out_free_cpages: for (i = 0; i < cc->nr_cpages; i++) { if (cc->cpages[i]) f2fs_compress_free_page(cc->cpages[i]); } - kfree(cc->cpages); + page_array_free(cc->inode, cc->cpages, cc->nr_cpages); cc->cpages = NULL; destroy_compress_ctx: if (cops->destroy_compress_ctx) @@ -677,7 +731,7 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) if (bio->bi_status || PageError(page)) dic->failed = true; - if (refcount_dec_not_one(&dic->ref)) + if (atomic_dec_return(&dic->pending_pages)) return; trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, @@ -689,8 +743,7 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) goto out_free_dic; } - dic->tpages = f2fs_kzalloc(sbi, sizeof(struct page *) * - dic->cluster_size, GFP_NOFS); + dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); if (!dic->tpages) { ret = -ENOMEM; goto out_free_dic; @@ -715,13 +768,13 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) goto out_free_dic; } - dic->rbuf = vmap(dic->tpages, dic->cluster_size, VM_MAP, PAGE_KERNEL); + dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); if (!dic->rbuf) { ret = -ENOMEM; goto destroy_decompress_ctx; } - dic->cbuf = vmap(dic->cpages, dic->nr_cpages, VM_MAP, PAGE_KERNEL_RO); + dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); if (!dic->cbuf) { ret = -ENOMEM; goto out_vunmap_rbuf; @@ -738,15 +791,15 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) ret = cops->decompress_pages(dic); out_vunmap_cbuf: - vunmap(dic->cbuf); + vm_unmap_ram(dic->cbuf, dic->nr_cpages); out_vunmap_rbuf: - vunmap(dic->rbuf); + vm_unmap_ram(dic->rbuf, dic->cluster_size); destroy_decompress_ctx: if (cops->destroy_decompress_ctx) cops->destroy_decompress_ctx(dic); out_free_dic: if (verity) - refcount_set(&dic->ref, dic->nr_cpages); + atomic_set(&dic->pending_pages, dic->nr_cpages); if (!verity) f2fs_decompress_end_io(dic->rpages, dic->cluster_size, ret, false); @@ -1029,6 +1082,7 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata, { struct compress_ctx cc = { + .inode = inode, .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, .cluster_size = F2FS_I(inode)->i_cluster_size, .rpages = fsdata, @@ -1132,7 +1186,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, */ down_read(&sbi->node_write); } else if (!f2fs_trylock_op(sbi)) { - return -EAGAIN; + goto out_free; } set_new_dnode(&dn, cc->inode, NULL, NULL, 0); @@ -1155,15 +1209,14 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, fio.version = ni.version; - cic = f2fs_kzalloc(sbi, sizeof(struct compress_io_ctx), GFP_NOFS); + cic = kmem_cache_zalloc(cic_entry_slab, GFP_NOFS); if (!cic) goto out_put_dnode; cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; cic->inode = inode; - refcount_set(&cic->ref, cc->nr_cpages); - cic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << - cc->log_cluster_size, GFP_NOFS); + atomic_set(&cic->pending_pages, cc->nr_cpages); + cic->rpages = page_array_alloc(cc->inode, cc->cluster_size); if (!cic->rpages) goto out_put_cic; @@ -1257,11 +1310,13 @@ unlock_continue: spin_unlock(&fi->i_size_lock); f2fs_put_rpages(cc); + page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + cc->cpages = NULL; f2fs_destroy_compress_ctx(cc); return 0; out_destroy_crypt: - kfree(cic->rpages); + page_array_free(cc->inode, cic->rpages, cc->cluster_size); for (--i; i >= 0; i--) fscrypt_finalize_bounce_page(&cc->cpages[i]); @@ -1271,7 +1326,7 @@ out_destroy_crypt: f2fs_put_page(cc->cpages[i], 1); } out_put_cic: - kfree(cic); + kmem_cache_free(cic_entry_slab, cic); out_put_dnode: f2fs_put_dnode(&dn); out_unlock_op: @@ -1279,6 +1334,9 @@ out_unlock_op: up_read(&sbi->node_write); else f2fs_unlock_op(sbi); +out_free: + page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + cc->cpages = NULL; return -EAGAIN; } @@ -1296,7 +1354,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) dec_page_count(sbi, F2FS_WB_DATA); - if (refcount_dec_not_one(&cic->ref)) + if (atomic_dec_return(&cic->pending_pages)) return; for (i = 0; i < cic->nr_rpages; i++) { @@ -1305,8 +1363,8 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) end_page_writeback(cic->rpages[i]); } - kfree(cic->rpages); - kfree(cic); + page_array_free(cic->inode, cic->rpages, cic->nr_rpages); + kmem_cache_free(cic_entry_slab, cic); } static int f2fs_write_raw_pages(struct compress_ctx *cc, @@ -1388,9 +1446,6 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, struct writeback_control *wbc, enum iostat_type io_type) { - struct f2fs_inode_info *fi = F2FS_I(cc->inode); - const struct f2fs_compress_ops *cops = - f2fs_cops[fi->i_compress_algorithm]; int err; *submitted = 0; @@ -1405,9 +1460,6 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, err = f2fs_write_compressed_pages(cc, submitted, wbc, io_type); - cops->destroy_compress_ctx(cc); - kfree(cc->cpages); - cc->cpages = NULL; if (!err) return 0; f2fs_bug_on(F2FS_I_SB(cc->inode), err != -EAGAIN); @@ -1424,25 +1476,23 @@ destroy_out: struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) { - struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); struct decompress_io_ctx *dic; pgoff_t start_idx = start_idx_of_cluster(cc); int i; - dic = f2fs_kzalloc(sbi, sizeof(struct decompress_io_ctx), GFP_NOFS); + dic = kmem_cache_zalloc(dic_entry_slab, GFP_NOFS); if (!dic) return ERR_PTR(-ENOMEM); - dic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << - cc->log_cluster_size, GFP_NOFS); + dic->rpages = page_array_alloc(cc->inode, cc->cluster_size); if (!dic->rpages) { - kfree(dic); + kmem_cache_free(dic_entry_slab, dic); return ERR_PTR(-ENOMEM); } dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; dic->inode = cc->inode; - refcount_set(&dic->ref, cc->nr_cpages); + atomic_set(&dic->pending_pages, cc->nr_cpages); dic->cluster_idx = cc->cluster_idx; dic->cluster_size = cc->cluster_size; dic->log_cluster_size = cc->log_cluster_size; @@ -1453,8 +1503,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->rpages[i] = cc->rpages[i]; dic->nr_rpages = cc->cluster_size; - dic->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) * - dic->nr_cpages, GFP_NOFS); + dic->cpages = page_array_alloc(dic->inode, dic->nr_cpages); if (!dic->cpages) goto out_free; @@ -1489,7 +1538,7 @@ void f2fs_free_dic(struct decompress_io_ctx *dic) continue; f2fs_compress_free_page(dic->tpages[i]); } - kfree(dic->tpages); + page_array_free(dic->inode, dic->tpages, dic->cluster_size); } if (dic->cpages) { @@ -1498,11 +1547,11 @@ void f2fs_free_dic(struct decompress_io_ctx *dic) continue; f2fs_compress_free_page(dic->cpages[i]); } - kfree(dic->cpages); + page_array_free(dic->inode, dic->cpages, dic->nr_cpages); } - kfree(dic->rpages); - kfree(dic); + page_array_free(dic->inode, dic->rpages, dic->nr_rpages); + kmem_cache_free(dic_entry_slab, dic); } void f2fs_decompress_end_io(struct page **rpages, @@ -1530,3 +1579,76 @@ unlock: unlock_page(rpage); } } + +int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + char slab_name[32]; + + sprintf(slab_name, "f2fs_page_array_entry-%u:%u", MAJOR(dev), MINOR(dev)); + + sbi->page_array_slab_size = sizeof(struct page *) << + F2FS_OPTION(sbi).compress_log_size; + + sbi->page_array_slab = f2fs_kmem_cache_create(slab_name, + sbi->page_array_slab_size); + if (!sbi->page_array_slab) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) +{ + kmem_cache_destroy(sbi->page_array_slab); +} + +static int __init f2fs_init_cic_cache(void) +{ + cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry", + sizeof(struct compress_io_ctx)); + if (!cic_entry_slab) + return -ENOMEM; + return 0; +} + +static void f2fs_destroy_cic_cache(void) +{ + kmem_cache_destroy(cic_entry_slab); +} + +static int __init f2fs_init_dic_cache(void) +{ + dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry", + sizeof(struct decompress_io_ctx)); + if (!dic_entry_slab) + return -ENOMEM; + return 0; +} + +static void f2fs_destroy_dic_cache(void) +{ + kmem_cache_destroy(dic_entry_slab); +} + +int __init f2fs_init_compress_cache(void) +{ + int err; + + err = f2fs_init_cic_cache(); + if (err) + goto out; + err = f2fs_init_dic_cache(); + if (err) + goto free_cic; + return 0; +free_cic: + f2fs_destroy_cic_cache(); +out: + return -ENOMEM; +} + +void f2fs_destroy_compress_cache(void) +{ + f2fs_destroy_dic_cache(); + f2fs_destroy_cic_cache(); +} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 73683e58a08d..be4da52604ed 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -202,7 +202,7 @@ static void f2fs_verify_bio(struct bio *bio) dic = (struct decompress_io_ctx *)page_private(page); if (dic) { - if (refcount_dec_not_one(&dic->ref)) + if (atomic_dec_return(&dic->pending_pages)) continue; f2fs_verify_pages(dic->rpages, dic->cluster_size); @@ -517,7 +517,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, zero_user_segment(page, 0, PAGE_SIZE); SetPagePrivate(page); - set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE); + set_page_private(page, DUMMY_WRITTEN_PAGE); lock_page(page); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) f2fs_bug_on(sbi, 1); @@ -1416,7 +1416,7 @@ alloc: set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); old_blkaddr = dn->data_blkaddr; f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, - &sum, seg_type, NULL); + &sum, seg_type, NULL); if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) invalidate_mapping_pages(META_MAPPING(sbi), old_blkaddr, old_blkaddr); @@ -1803,10 +1803,6 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, static int get_data_block_bmap(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - /* Block number less than F2FS MAX BLOCKS */ - if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks)) - return -EFBIG; - return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_BMAP, NULL, NO_CHECK_TYPE, create); @@ -2272,8 +2268,8 @@ submit_and_realloc: if (IS_ERR(bio)) { ret = PTR_ERR(bio); dic->failed = true; - if (refcount_sub_and_test(dic->nr_cpages - i, - &dic->ref)) { + if (!atomic_sub_return(dic->nr_cpages - i, + &dic->pending_pages)) { f2fs_decompress_end_io(dic->rpages, cc->cluster_size, true, false); @@ -3133,6 +3129,8 @@ next: retry = 0; } } + if (f2fs_compressed_file(inode)) + f2fs_destroy_compress_ctx(&cc); #endif if (retry) { index = 0; @@ -3574,7 +3572,7 @@ static void f2fs_dio_end_io(struct bio *bio) bio->bi_private = dio->orig_private; bio->bi_end_io = dio->orig_end_io; - kvfree(dio); + kfree(dio); bio_endio(bio); } @@ -3673,12 +3671,18 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) err); if (!do_opu) set_inode_flag(inode, FI_UPDATE_WRITE); + } else if (err == -EIOCBQUEUED) { + f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, + count - iov_iter_count(iter)); } else if (err < 0) { f2fs_write_failed(mapping, offset + count); } } else { if (err > 0) f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, err); + else if (err == -EIOCBQUEUED) + f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_READ_IO, + count - iov_iter_count(iter)); } out: @@ -3807,11 +3811,16 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) filemap_write_and_wait(mapping); - if (f2fs_compressed_file(inode)) - blknr = f2fs_bmap_compress(inode, block); + /* Block number less than F2FS MAX BLOCKS */ + if (unlikely(block >= F2FS_I_SB(inode)->max_file_blocks)) + goto out; - if (!get_data_block_bmap(inode, block, &tmp, 0)) - blknr = tmp.b_blocknr; + if (f2fs_compressed_file(inode)) { + blknr = f2fs_bmap_compress(inode, block); + } else { + if (!get_data_block_bmap(inode, block, &tmp, 0)) + blknr = tmp.b_blocknr; + } out: trace_f2fs_bmap(inode, block, blknr); return blknr; @@ -3874,6 +3883,83 @@ int f2fs_migrate_page(struct address_space *mapping, #endif #ifdef CONFIG_SWAP +static int check_swap_activate_fast(struct swap_info_struct *sis, + struct file *swap_file, sector_t *span) +{ + struct address_space *mapping = swap_file->f_mapping; + struct inode *inode = mapping->host; + sector_t cur_lblock; + sector_t last_lblock; + sector_t pblock; + sector_t lowest_pblock = -1; + sector_t highest_pblock = 0; + int nr_extents = 0; + unsigned long nr_pblocks; + unsigned long len; + int ret; + + /* + * Map all the blocks into the extent list. This code doesn't try + * to be very smart. + */ + cur_lblock = 0; + last_lblock = logical_to_blk(inode, i_size_read(inode)); + len = i_size_read(inode); + + while (cur_lblock <= last_lblock && cur_lblock < sis->max) { + struct buffer_head map_bh; + pgoff_t next_pgofs; + + cond_resched(); + + memset(&map_bh, 0, sizeof(struct buffer_head)); + map_bh.b_size = len - cur_lblock; + + ret = get_data_block(inode, cur_lblock, &map_bh, 0, + F2FS_GET_BLOCK_FIEMAP, &next_pgofs); + if (ret) + goto err_out; + + /* hole */ + if (!buffer_mapped(&map_bh)) + goto err_out; + + pblock = map_bh.b_blocknr; + nr_pblocks = logical_to_blk(inode, map_bh.b_size); + + if (cur_lblock + nr_pblocks >= sis->max) + nr_pblocks = sis->max - cur_lblock; + + if (cur_lblock) { /* exclude the header page */ + if (pblock < lowest_pblock) + lowest_pblock = pblock; + if (pblock + nr_pblocks - 1 > highest_pblock) + highest_pblock = pblock + nr_pblocks - 1; + } + + /* + * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks + */ + ret = add_swap_extent(sis, cur_lblock, nr_pblocks, pblock); + if (ret < 0) + goto out; + nr_extents += ret; + cur_lblock += nr_pblocks; + } + ret = nr_extents; + *span = 1 + highest_pblock - lowest_pblock; + if (cur_lblock == 0) + cur_lblock = 1; /* force Empty message */ + sis->max = cur_lblock; + sis->pages = cur_lblock - 1; + sis->highest_bit = cur_lblock - 1; +out: + return ret; +err_out: + pr_err("swapon: swapfile has holes\n"); + return -EINVAL; +} + /* Copied from generic_swapfile_activate() to check any holes */ static int check_swap_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) @@ -3890,6 +3976,9 @@ static int check_swap_activate(struct swap_info_struct *sis, int nr_extents = 0; int ret; + if (PAGE_SIZE == F2FS_BLKSIZE) + return check_swap_activate_fast(sis, swap_file, span); + blkbits = inode->i_blkbits; blocks_per_page = PAGE_SIZE >> blkbits; @@ -3989,7 +4078,7 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) return ret; - if (f2fs_disable_compressed_file(inode)) + if (!f2fs_disable_compressed_file(inode)) return -EINVAL; ret = check_swap_activate(sis, file, span); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 4276c0f79beb..a8357fd4f5fa 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -131,7 +131,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); si->compr_inode = atomic_read(&sbi->compr_inode); - si->compr_blocks = atomic_read(&sbi->compr_blocks); + si->compr_blocks = atomic64_read(&sbi->compr_blocks); si->append = sbi->im[APPEND_INO].ino_num; si->update = sbi->im[UPDATE_INO].ino_num; si->orphans = sbi->im[ORPHAN_INO].ino_num; @@ -164,7 +164,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; si->util_invalid = 50 - si->util_free - si->util_valid; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) { + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); si->curseg[i] = curseg->segno; si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); @@ -342,7 +342,7 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); - seq_printf(s, " - Compressed Inode: %u, Blocks: %u\n", + seq_printf(s, " - Compressed Inode: %u, Blocks: %llu\n", si->compr_inode, si->compr_blocks); seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", si->orphans, si->append, si->update); @@ -393,6 +393,14 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_seg[CURSEG_COLD_NODE], si->full_seg[CURSEG_COLD_NODE], si->valid_blks[CURSEG_COLD_NODE]); + seq_printf(s, " - Pinned file: %8d %8d %8d\n", + si->curseg[CURSEG_COLD_DATA_PINNED], + si->cursec[CURSEG_COLD_DATA_PINNED], + si->curzone[CURSEG_COLD_DATA_PINNED]); + seq_printf(s, " - ATGC data: %8d %8d %8d\n", + si->curseg[CURSEG_ALL_DATA_ATGC], + si->cursec[CURSEG_ALL_DATA_ATGC], + si->curzone[CURSEG_ALL_DATA_ATGC]); seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n", si->main_area_segs - si->dirty_count - si->prefree_count - si->free_segs, @@ -542,7 +550,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_inode, 0); atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->compr_inode, 0); - atomic_set(&sbi->compr_blocks, 0); + atomic64_set(&sbi->compr_blocks, 0); atomic_set(&sbi->inplace_count, 0); for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); @@ -566,7 +574,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) list_del(&si->stat_list); mutex_unlock(&f2fs_stat_mutex); - kvfree(si); + kfree(si); } void __init f2fs_create_root_stats(void) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 53fbc4dd6e48..4b9ef8bbfa4a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -75,21 +75,22 @@ int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname) { #ifdef CONFIG_UNICODE - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct super_block *sb = dir->i_sb; + struct f2fs_sb_info *sbi = F2FS_SB(sb); if (IS_CASEFOLDED(dir)) { fname->cf_name.name = f2fs_kmalloc(sbi, F2FS_NAME_LEN, GFP_NOFS); if (!fname->cf_name.name) return -ENOMEM; - fname->cf_name.len = utf8_casefold(sbi->s_encoding, + fname->cf_name.len = utf8_casefold(sb->s_encoding, fname->usr_fname, fname->cf_name.name, F2FS_NAME_LEN); if ((int)fname->cf_name.len <= 0) { kfree(fname->cf_name.name); fname->cf_name.name = NULL; - if (f2fs_has_strict_mode(sbi)) + if (sb_has_strict_encoding(sb)) return -EINVAL; /* fall back to treating name as opaque byte sequence */ } @@ -190,21 +191,15 @@ static unsigned long dir_block_index(unsigned int level, static struct f2fs_dir_entry *find_in_block(struct inode *dir, struct page *dentry_page, const struct f2fs_filename *fname, - int *max_slots, - struct page **res_page) + int *max_slots) { struct f2fs_dentry_block *dentry_blk; - struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page); make_dentry_ptr_block(dir, &d, dentry_blk); - de = f2fs_find_target_dentry(&d, fname, max_slots); - if (de) - *res_page = dentry_page; - - return de; + return f2fs_find_target_dentry(&d, fname, max_slots); } #ifdef CONFIG_UNICODE @@ -215,8 +210,8 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir, static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name, const u8 *de_name, u32 de_name_len) { - const struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - const struct unicode_map *um = sbi->s_encoding; + const struct super_block *sb = dir->i_sb; + const struct unicode_map *um = sb->s_encoding; struct qstr entry = QSTR_INIT(de_name, de_name_len); int res; @@ -226,7 +221,7 @@ static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name, * In strict mode, ignore invalid names. In non-strict mode, * fall back to treating them as opaque byte sequences. */ - if (f2fs_has_strict_mode(sbi) || name->len != entry.len) + if (sb_has_strict_encoding(sb) || name->len != entry.len) return false; return !memcmp(name->name, entry.name, name->len); } @@ -330,10 +325,11 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, } } - de = find_in_block(dir, dentry_page, fname, &max_slots, - res_page); - if (de) + de = find_in_block(dir, dentry_page, fname, &max_slots); + if (de) { + *res_page = dentry_page; break; + } if (max_slots >= s) room = true; @@ -357,16 +353,15 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, unsigned int max_depth; unsigned int level; + *res_page = NULL; + if (f2fs_has_inline_dentry(dir)) { - *res_page = NULL; de = f2fs_find_in_inline_dir(dir, fname, res_page); goto out; } - if (npages == 0) { - *res_page = NULL; + if (npages == 0) goto out; - } max_depth = F2FS_I(dir)->i_current_depth; if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { @@ -377,7 +372,6 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, } for (level = 0; level < max_depth; level++) { - *res_page = NULL; de = find_in_level(dir, level, fname, res_page); if (de || IS_ERR(*res_page)) break; @@ -1107,75 +1101,8 @@ const struct file_operations f2fs_dir_operations = { }; #ifdef CONFIG_UNICODE -static int f2fs_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name) -{ - const struct dentry *parent = READ_ONCE(dentry->d_parent); - const struct inode *dir = READ_ONCE(parent->d_inode); - const struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - struct qstr entry = QSTR_INIT(str, len); - char strbuf[DNAME_INLINE_LEN]; - int res; - - if (!dir || !IS_CASEFOLDED(dir)) - goto fallback; - - /* - * If the dentry name is stored in-line, then it may be concurrently - * modified by a rename. If this happens, the VFS will eventually retry - * the lookup, so it doesn't matter what ->d_compare() returns. - * However, it's unsafe to call utf8_strncasecmp() with an unstable - * string. Therefore, we have to copy the name into a temporary buffer. - */ - if (len <= DNAME_INLINE_LEN - 1) { - memcpy(strbuf, str, len); - strbuf[len] = 0; - entry.name = strbuf; - /* prevent compiler from optimizing out the temporary buffer */ - barrier(); - } - - res = utf8_strncasecmp(sbi->s_encoding, name, &entry); - if (res >= 0) - return res; - - if (f2fs_has_strict_mode(sbi)) - return -EINVAL; -fallback: - if (len != name->len) - return 1; - return !!memcmp(str, name->name, len); -} - -static int f2fs_d_hash(const struct dentry *dentry, struct qstr *str) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - const struct unicode_map *um = sbi->s_encoding; - const struct inode *inode = READ_ONCE(dentry->d_inode); - unsigned char *norm; - int len, ret = 0; - - if (!inode || !IS_CASEFOLDED(inode)) - return 0; - - norm = f2fs_kmalloc(sbi, PATH_MAX, GFP_ATOMIC); - if (!norm) - return -ENOMEM; - - len = utf8_casefold(um, str, norm, PATH_MAX); - if (len < 0) { - if (f2fs_has_strict_mode(sbi)) - ret = -EINVAL; - goto out; - } - str->hash = full_name_hash(dentry, norm, len); -out: - kvfree(norm); - return ret; -} - const struct dentry_operations f2fs_dentry_ops = { - .d_hash = f2fs_d_hash, - .d_compare = f2fs_d_compare, + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, }; #endif diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 686c68b98610..3ebf976a682d 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -58,6 +58,29 @@ struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, return re; } +struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi, + struct rb_root_cached *root, + struct rb_node **parent, + unsigned long long key, bool *leftmost) +{ + struct rb_node **p = &root->rb_root.rb_node; + struct rb_entry *re; + + while (*p) { + *parent = *p; + re = rb_entry(*parent, struct rb_entry, rb_node); + + if (key < re->key) { + p = &(*p)->rb_left; + } else { + p = &(*p)->rb_right; + *leftmost = false; + } + } + + return p; +} + struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root_cached *root, struct rb_node **parent, @@ -166,7 +189,7 @@ lookup_neighbors: } bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root_cached *root) + struct rb_root_cached *root, bool check_key) { #ifdef CONFIG_F2FS_CHECK_FS struct rb_node *cur = rb_first_cached(root), *next; @@ -183,13 +206,23 @@ bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, cur_re = rb_entry(cur, struct rb_entry, rb_node); next_re = rb_entry(next, struct rb_entry, rb_node); + if (check_key) { + if (cur_re->key > next_re->key) { + f2fs_info(sbi, "inconsistent rbtree, " + "cur(%llu) next(%llu)", + cur_re->key, next_re->key); + return false; + } + goto next; + } + if (cur_re->ofs + cur_re->len > next_re->ofs) { f2fs_info(sbi, "inconsistent rbtree, cur(%u, %u) next(%u, %u)", cur_re->ofs, cur_re->len, next_re->ofs, next_re->len); return false; } - +next: cur = next; } #endif diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7c089ff7ff94..cb700d797296 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -98,6 +98,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_RESERVE_ROOT 0x01000000 #define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 #define F2FS_MOUNT_NORECOVERY 0x04000000 +#define F2FS_MOUNT_ATGC 0x08000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -612,8 +613,13 @@ enum { struct rb_entry { struct rb_node rb_node; /* rb node located in rb-tree */ - unsigned int ofs; /* start offset of the entry */ - unsigned int len; /* length of the entry */ + union { + struct { + unsigned int ofs; /* start offset of the entry */ + unsigned int len; /* length of the entry */ + }; + unsigned long long key; /* 64-bits key */ + } __packed; }; struct extent_info { @@ -801,7 +807,7 @@ struct f2fs_inode_info { struct timespec64 i_disk_time[4];/* inode disk times */ /* for file compress */ - u64 i_compr_blocks; /* # of compressed blocks */ + atomic_t i_compr_blocks; /* # of compressed blocks */ unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ unsigned int i_cluster_size; /* cluster size */ @@ -973,7 +979,9 @@ static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, */ #define NR_CURSEG_DATA_TYPE (3) #define NR_CURSEG_NODE_TYPE (3) -#define NR_CURSEG_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) +#define NR_CURSEG_INMEM_TYPE (2) +#define NR_CURSEG_PERSIST_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) +#define NR_CURSEG_TYPE (NR_CURSEG_INMEM_TYPE + NR_CURSEG_PERSIST_TYPE) enum { CURSEG_HOT_DATA = 0, /* directory entry blocks */ @@ -982,8 +990,11 @@ enum { CURSEG_HOT_NODE, /* direct node blocks of directory files */ CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ - NO_CHECK_TYPE, - CURSEG_COLD_DATA_PINNED,/* cold data for pinned file */ + NR_PERSISTENT_LOG, /* number of persistent log */ + CURSEG_COLD_DATA_PINNED = NR_PERSISTENT_LOG, + /* pinned file that needs consecutive block address */ + CURSEG_ALL_DATA_ATGC, /* SSR alloctor in hot/warm/cold data area */ + NO_CHECK_TYPE, /* number of persistent & inmem log */ }; struct flush_cmd { @@ -1209,6 +1220,7 @@ struct f2fs_dev_info { #ifdef CONFIG_BLK_DEV_ZONED unsigned int nr_blkz; /* Total number of zones */ unsigned long *blkz_seq; /* Bitmap indicating sequential zones */ + block_t *zone_capacity_blocks; /* Array of zone capacity in blks */ #endif }; @@ -1228,6 +1240,18 @@ struct inode_management { unsigned long ino_num; /* number of entries */ }; +/* for GC_AT */ +struct atgc_management { + bool atgc_enabled; /* ATGC is enabled or not */ + struct rb_root_cached root; /* root of victim rb-tree */ + struct list_head victim_list; /* linked with all victim entries */ + unsigned int victim_count; /* victim count in rb-tree */ + unsigned int candidate_ratio; /* candidate ratio */ + unsigned int max_candidate_count; /* max candidate count */ + unsigned int age_weight; /* age weight, vblock_weight = 100 - age_weight */ + unsigned long long age_threshold; /* age threshold */ +}; + /* For s_flag in struct f2fs_sb_info */ enum { SBI_IS_DIRTY, /* dirty flag for checkpoint */ @@ -1260,6 +1284,7 @@ enum { GC_NORMAL, GC_IDLE_CB, GC_IDLE_GREEDY, + GC_IDLE_AT, GC_URGENT_HIGH, GC_URGENT_LOW, }; @@ -1303,9 +1328,9 @@ enum fsync_mode { #define DUMMY_WRITTEN_PAGE ((unsigned long)-2) #define IS_ATOMIC_WRITTEN_PAGE(page) \ - (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) + (page_private(page) == ATOMIC_WRITTEN_PAGE) #define IS_DUMMY_WRITTEN_PAGE(page) \ - (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) + (page_private(page) == DUMMY_WRITTEN_PAGE) #ifdef CONFIG_F2FS_IO_TRACE #define IS_IO_TRACED_PAGE(page) \ @@ -1359,7 +1384,7 @@ struct compress_io_ctx { struct inode *inode; /* inode the context belong to */ struct page **rpages; /* pages store raw data in cluster */ unsigned int nr_rpages; /* total page number in rpages */ - refcount_t ref; /* referrence count of raw page */ + atomic_t pending_pages; /* in-flight compressed page count */ }; /* decompress io context for read IO path */ @@ -1378,7 +1403,7 @@ struct decompress_io_ctx { struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ - refcount_t ref; /* referrence count of compressed page */ + atomic_t pending_pages; /* in-flight compressed page count */ bool failed; /* indicate IO error during decompression */ void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ @@ -1387,7 +1412,7 @@ struct decompress_io_ctx { #define NULL_CLUSTER ((unsigned int)(~0)) #define MIN_COMPRESS_LOG_SIZE 2 #define MAX_COMPRESS_LOG_SIZE 8 -#define MAX_COMPRESS_WINDOW_SIZE ((PAGE_SIZE) << MAX_COMPRESS_LOG_SIZE) +#define MAX_COMPRESS_WINDOW_SIZE(log_size) ((PAGE_SIZE) << (log_size)) struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ @@ -1397,10 +1422,6 @@ struct f2fs_sb_info { int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ struct mutex writepages; /* mutex for writepages() */ -#ifdef CONFIG_UNICODE - struct unicode_map *s_encoding; - __u16 s_encoding_flags; -#endif #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ @@ -1508,6 +1529,7 @@ struct f2fs_sb_info { * race between GC and GC or CP */ struct f2fs_gc_kthread *gc_thread; /* GC thread */ + struct atgc_management am; /* atgc management */ unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ @@ -1544,7 +1566,7 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t compr_inode; /* # of compressed inodes */ - atomic_t compr_blocks; /* # of compressed blocks */ + atomic64_t compr_blocks; /* # of compressed blocks */ atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ atomic_t max_vw_cnt; /* max # of volatile writes */ @@ -1593,6 +1615,11 @@ struct f2fs_sb_info { struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ unsigned int inline_xattr_slab_size; /* default inline xattr slab size */ + +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct kmem_cache *page_array_slab; /* page array entry */ + unsigned int page_array_slab_size; /* default page array slab size */ +#endif }; struct f2fs_private_dio { @@ -3325,6 +3352,11 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi); int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable); void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi); +void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi); +void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi); +void f2fs_get_new_segment(struct f2fs_sb_info *sbi, + unsigned int *newseg, bool new_sec, int dir); void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type); @@ -3343,7 +3375,8 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn, int f2fs_inplace_write_data(struct f2fs_io_info *fio); void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, - bool recover_curseg, bool recover_newaddr); + bool recover_curseg, bool recover_newaddr, + bool from_gc); void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, unsigned char version, bool recover_curseg, @@ -3371,6 +3404,10 @@ void f2fs_destroy_segment_manager_caches(void); int f2fs_rw_hint_to_seg_type(enum rw_hint hint); enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp); +unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, + unsigned int segno); +unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, + unsigned int segno); /* * checkpoint.c @@ -3378,7 +3415,7 @@ enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); -struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); @@ -3486,6 +3523,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, unsigned int segno); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); +int __init f2fs_create_garbage_collection_cache(void); +void f2fs_destroy_garbage_collection_cache(void); /* * recovery.c @@ -3521,7 +3560,8 @@ struct f2fs_stat_info { int nr_discard_cmd; unsigned int undiscard_blks; int inline_xattr, inline_inode, inline_dir, append, update, orphans; - int compr_inode, compr_blocks; + int compr_inode; + unsigned long long compr_blocks; int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; @@ -3606,9 +3646,9 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic_dec(&F2FS_I_SB(inode)->compr_inode)); \ } while (0) #define stat_add_compr_blocks(inode, blocks) \ - (atomic_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) + (atomic64_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_sub_compr_blocks(inode, blocks) \ - (atomic_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) + (atomic64_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_inc_meta_count(sbi, blkaddr) \ do { \ if (blkaddr < SIT_I(sbi)->sit_base_addr) \ @@ -3787,6 +3827,10 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); */ struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs); +struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi, + struct rb_root_cached *root, + struct rb_node **parent, + unsigned long long key, bool *left_most); struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root_cached *root, struct rb_node **parent, @@ -3797,7 +3841,7 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, struct rb_node ***insert_p, struct rb_node **insert_parent, bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root_cached *root); + struct rb_root_cached *root, bool check_key); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); void f2fs_init_extent_tree(struct inode *inode, struct page *ipage); void f2fs_drop_extent_tree(struct inode *inode); @@ -3883,6 +3927,10 @@ void f2fs_decompress_end_io(struct page **rpages, int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); +int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); +void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); +int __init f2fs_init_compress_cache(void); +void f2fs_destroy_compress_cache(void); #else static inline bool f2fs_is_compressed_page(struct page *page) { return false; } static inline bool f2fs_is_compress_backend_ready(struct inode *inode) @@ -3899,6 +3947,10 @@ static inline struct page *f2fs_compress_control_page(struct page *page) } static inline int f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } +static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } +static inline int __init f2fs_init_compress_cache(void) { return 0; } +static inline void f2fs_destroy_compress_cache(void) { } #endif static inline void set_compress_context(struct inode *inode) @@ -3917,24 +3969,21 @@ static inline void set_compress_context(struct inode *inode) f2fs_mark_inode_dirty_sync(inode, true); } -static inline u64 f2fs_disable_compressed_file(struct inode *inode) +static inline bool f2fs_disable_compressed_file(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); if (!f2fs_compressed_file(inode)) - return 0; - if (S_ISREG(inode->i_mode)) { - if (get_dirty_pages(inode)) - return 1; - if (fi->i_compr_blocks) - return fi->i_compr_blocks; - } + return true; + if (S_ISREG(inode->i_mode) && + (get_dirty_pages(inode) || atomic_read(&fi->i_compr_blocks))) + return false; fi->i_flags &= ~F2FS_COMPR_FL; stat_dec_compr_inode(inode); clear_inode_flag(inode, FI_COMPRESSED_FILE); f2fs_mark_inode_dirty_sync(inode, true); - return 0; + return true; } #define F2FS_FEATURE_FUNCS(name, flagname) \ @@ -4028,16 +4077,17 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode, u64 blocks, bool add) { int diff = F2FS_I(inode)->i_cluster_size - blocks; + struct f2fs_inode_info *fi = F2FS_I(inode); /* don't update i_compr_blocks if saved blocks were released */ - if (!add && !F2FS_I(inode)->i_compr_blocks) + if (!add && !atomic_read(&fi->i_compr_blocks)) return; if (add) { - F2FS_I(inode)->i_compr_blocks += diff; + atomic_add(diff, &fi->i_compr_blocks); stat_add_compr_blocks(inode, diff); } else { - F2FS_I(inode)->i_compr_blocks -= diff; + atomic_sub(diff, &fi->i_compr_blocks); stat_sub_compr_blocks(inode, diff); } f2fs_mark_inode_dirty_sync(inode, true); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8a422400e824..ee861c6d9ff0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -376,32 +376,15 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) return f2fs_do_sync_file(file, start, end, datasync, false); } -static pgoff_t __get_first_dirty_index(struct address_space *mapping, - pgoff_t pgofs, int whence) -{ - struct page *page; - int nr_pages; - - if (whence != SEEK_DATA) - return 0; - - /* find first dirty page index */ - nr_pages = find_get_pages_tag(mapping, &pgofs, PAGECACHE_TAG_DIRTY, - 1, &page); - if (!nr_pages) - return ULONG_MAX; - pgofs = page->index; - put_page(page); - return pgofs; -} - -static bool __found_offset(struct f2fs_sb_info *sbi, block_t blkaddr, - pgoff_t dirty, pgoff_t pgofs, int whence) +static bool __found_offset(struct address_space *mapping, block_t blkaddr, + pgoff_t index, int whence) { switch (whence) { case SEEK_DATA: - if ((blkaddr == NEW_ADDR && dirty == pgofs) || - __is_valid_data_blkaddr(blkaddr)) + if (__is_valid_data_blkaddr(blkaddr)) + return true; + if (blkaddr == NEW_ADDR && + xa_get_mark(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY)) return true; break; case SEEK_HOLE: @@ -417,7 +400,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; loff_t maxbytes = inode->i_sb->s_maxbytes; struct dnode_of_data dn; - pgoff_t pgofs, end_offset, dirty; + pgoff_t pgofs, end_offset; loff_t data_ofs = offset; loff_t isize; int err = 0; @@ -429,16 +412,13 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) goto fail; /* handle inline data case */ - if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) { - if (whence == SEEK_HOLE) - data_ofs = isize; + if (f2fs_has_inline_data(inode) && whence == SEEK_HOLE) { + data_ofs = isize; goto found; } pgofs = (pgoff_t)(offset >> PAGE_SHIFT); - dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); - for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); @@ -471,7 +451,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) goto fail; } - if (__found_offset(F2FS_I_SB(inode), blkaddr, dirty, + if (__found_offset(file->f_mapping, blkaddr, pgofs, whence)) { f2fs_put_dnode(&dn); goto found; @@ -564,7 +544,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) bool compressed_cluster = false; int cluster_index = 0, valid_blocks = 0; int cluster_size = F2FS_I(dn->inode)->i_cluster_size; - bool released = !F2FS_I(dn->inode)->i_compr_blocks; + bool released = !atomic_read(&F2FS_I(dn->inode)->i_compr_blocks); if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) base = get_extra_isize(dn->inode); @@ -753,11 +733,14 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) return err; #ifdef CONFIG_F2FS_FS_COMPRESSION - if (from != free_from) + if (from != free_from) { err = f2fs_truncate_partial_cluster(inode, from, lock); + if (err) + return err; + } #endif - return err; + return 0; } int f2fs_truncate(struct inode *inode) @@ -1656,13 +1639,14 @@ next_alloc: } down_write(&sbi->pin_sem); - map.m_seg_type = CURSEG_COLD_DATA_PINNED; f2fs_lock_op(sbi); - f2fs_allocate_new_segment(sbi, CURSEG_COLD_DATA); + f2fs_allocate_new_segment(sbi, CURSEG_COLD_DATA_PINNED); f2fs_unlock_op(sbi); + map.m_seg_type = CURSEG_COLD_DATA_PINNED; err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + up_write(&sbi->pin_sem); done += map.m_len; @@ -1828,7 +1812,7 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) if ((iflags ^ masked_flags) & F2FS_COMPR_FL) { if (masked_flags & F2FS_COMPR_FL) { - if (f2fs_disable_compressed_file(inode)) + if (!f2fs_disable_compressed_file(inode)) return -EINVAL; } if (iflags & F2FS_NOCOMP_FL) @@ -1836,6 +1820,8 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) if (iflags & F2FS_COMPR_FL) { if (!f2fs_may_compress(inode)) return -EINVAL; + if (S_ISREG(inode->i_mode) && inode->i_size) + return -EINVAL; set_compress_context(inode); } @@ -2783,6 +2769,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, if (IS_ENCRYPTED(src) || IS_ENCRYPTED(dst)) return -EOPNOTSUPP; + if (pos_out < 0 || pos_in < 0) + return -EINVAL; + if (src == dst) { if (pos_in == pos_out) return 0; @@ -3258,7 +3247,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) if (ret) goto out; - if (f2fs_disable_compressed_file(inode)) { + if (!f2fs_disable_compressed_file(inode)) { ret = -EOPNOTSUPP; goto out; } @@ -3385,7 +3374,7 @@ static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg) min(FSLABEL_MAX, count))) err = -EFAULT; - kvfree(vbuf); + kfree(vbuf); return err; } @@ -3436,7 +3425,7 @@ static int f2fs_get_compress_blocks(struct file *filp, unsigned long arg) if (!f2fs_compressed_file(inode)) return -EINVAL; - blocks = F2FS_I(inode)->i_compr_blocks; + blocks = atomic_read(&F2FS_I(inode)->i_compr_blocks); return put_user(blocks, (u64 __user *)arg); } @@ -3521,7 +3510,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) inode_lock(inode); writecount = atomic_read(&inode->i_writecount); - if ((filp->f_mode & FMODE_WRITE && writecount != 1) || writecount) { + if ((filp->f_mode & FMODE_WRITE && writecount != 1) || + (!(filp->f_mode & FMODE_WRITE) && writecount)) { ret = -EBUSY; goto out; } @@ -3540,7 +3530,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, true); - if (!F2FS_I(inode)->i_compr_blocks) + if (!atomic_read(&F2FS_I(inode)->i_compr_blocks)) goto out; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -3588,14 +3578,15 @@ out: if (ret >= 0) { ret = put_user(released_blocks, (u64 __user *)arg); - } else if (released_blocks && F2FS_I(inode)->i_compr_blocks) { + } else if (released_blocks && + atomic_read(&F2FS_I(inode)->i_compr_blocks)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx " - "iblocks=%llu, released=%u, compr_blocks=%llu, " + "iblocks=%llu, released=%u, compr_blocks=%u, " "run fsck to fix.", __func__, inode->i_ino, inode->i_blocks, released_blocks, - F2FS_I(inode)->i_compr_blocks); + atomic_read(&F2FS_I(inode)->i_compr_blocks)); } return ret; @@ -3683,7 +3674,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) if (ret) return ret; - if (F2FS_I(inode)->i_compr_blocks) + if (atomic_read(&F2FS_I(inode)->i_compr_blocks)) goto out; f2fs_balance_fs(F2FS_I_SB(inode), true); @@ -3747,14 +3738,15 @@ out: if (ret >= 0) { ret = put_user(reserved_blocks, (u64 __user *)arg); - } else if (reserved_blocks && F2FS_I(inode)->i_compr_blocks) { + } else if (reserved_blocks && + atomic_read(&F2FS_I(inode)->i_compr_blocks)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx " - "iblocks=%llu, reserved=%u, compr_blocks=%llu, " + "iblocks=%llu, reserved=%u, compr_blocks=%u, " "run fsck to fix.", __func__, inode->i_ino, inode->i_blocks, reserved_blocks, - F2FS_I(inode)->i_compr_blocks); + atomic_read(&F2FS_I(inode)->i_compr_blocks)); } return ret; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 11b4adde9baf..05641a1e36cc 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -21,6 +21,8 @@ #include "gc.h" #include <trace/events/f2fs.h> +static struct kmem_cache *victim_entry_slab; + static unsigned int count_bits(const unsigned long *addr, unsigned int offset, unsigned int len); @@ -150,7 +152,7 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { err = PTR_ERR(gc_th->f2fs_gc_task); - kvfree(gc_th); + kfree(gc_th); sbi->gc_thread = NULL; } out: @@ -163,13 +165,22 @@ void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) if (!gc_th) return; kthread_stop(gc_th->f2fs_gc_task); - kvfree(gc_th); + kfree(gc_th); sbi->gc_thread = NULL; } static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type) { - int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY; + int gc_mode; + + if (gc_type == BG_GC) { + if (sbi->am.atgc_enabled) + gc_mode = GC_AT; + else + gc_mode = GC_CB; + } else { + gc_mode = GC_GREEDY; + } switch (sbi->gc_mode) { case GC_IDLE_CB: @@ -179,7 +190,11 @@ static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type) case GC_URGENT_HIGH: gc_mode = GC_GREEDY; break; + case GC_IDLE_AT: + gc_mode = GC_AT; + break; } + return gc_mode; } @@ -193,6 +208,11 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->dirty_bitmap = dirty_i->dirty_segmap[type]; p->max_search = dirty_i->nr_dirty[type]; p->ofs_unit = 1; + } else if (p->alloc_mode == AT_SSR) { + p->gc_mode = GC_GREEDY; + p->dirty_bitmap = dirty_i->dirty_segmap[type]; + p->max_search = dirty_i->nr_dirty[type]; + p->ofs_unit = 1; } else { p->gc_mode = select_gc_type(sbi, gc_type); p->ofs_unit = sbi->segs_per_sec; @@ -212,6 +232,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, */ if (gc_type != FG_GC && (sbi->gc_mode != GC_URGENT_HIGH) && + (p->gc_mode != GC_AT && p->alloc_mode != AT_SSR) && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; @@ -229,10 +250,16 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, /* SSR allocates in a segment unit */ if (p->alloc_mode == SSR) return sbi->blocks_per_seg; + else if (p->alloc_mode == AT_SSR) + return UINT_MAX; + + /* LFS */ if (p->gc_mode == GC_GREEDY) return 2 * sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; + else if (p->gc_mode == GC_AT) + return UINT_MAX; else /* No other gc_mode */ return 0; } @@ -266,13 +293,14 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) unsigned char age = 0; unsigned char u; unsigned int i; + unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi, segno); - for (i = 0; i < sbi->segs_per_sec; i++) + for (i = 0; i < usable_segs_per_sec; i++) mtime += get_seg_entry(sbi, start + i)->mtime; vblocks = get_valid_blocks(sbi, segno, true); - mtime = div_u64(mtime, sbi->segs_per_sec); - vblocks = div_u64(vblocks, sbi->segs_per_sec); + mtime = div_u64(mtime, usable_segs_per_sec); + vblocks = div_u64(vblocks, usable_segs_per_sec); u = (vblocks * 100) >> sbi->log_blocks_per_seg; @@ -297,8 +325,11 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) return get_valid_blocks(sbi, segno, true); - else + else if (p->gc_mode == GC_CB) return get_cb_cost(sbi, segno); + + f2fs_bug_on(sbi, 1); + return 0; } static unsigned int count_bits(const unsigned long *addr, @@ -313,6 +344,273 @@ static unsigned int count_bits(const unsigned long *addr, return sum; } +static struct victim_entry *attach_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime, unsigned int segno, + struct rb_node *parent, struct rb_node **p, + bool left_most) +{ + struct atgc_management *am = &sbi->am; + struct victim_entry *ve; + + ve = f2fs_kmem_cache_alloc(victim_entry_slab, GFP_NOFS); + + ve->mtime = mtime; + ve->segno = segno; + + rb_link_node(&ve->rb_node, parent, p); + rb_insert_color_cached(&ve->rb_node, &am->root, left_most); + + list_add_tail(&ve->list, &am->victim_list); + + am->victim_count++; + + return ve; +} + +static void insert_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime, unsigned int segno) +{ + struct atgc_management *am = &sbi->am; + struct rb_node **p; + struct rb_node *parent = NULL; + bool left_most = true; + + p = f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, mtime, &left_most); + attach_victim_entry(sbi, mtime, segno, parent, p, left_most); +} + +static void add_victim_entry(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p, unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start = GET_SEG_FROM_SEC(sbi, secno); + unsigned long long mtime = 0; + unsigned int i; + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (p->gc_mode == GC_AT && + get_valid_blocks(sbi, segno, true) == 0) + return; + + if (p->alloc_mode == AT_SSR && + get_seg_entry(sbi, segno)->ckpt_valid_blocks == 0) + return; + } + + for (i = 0; i < sbi->segs_per_sec; i++) + mtime += get_seg_entry(sbi, start + i)->mtime; + mtime = div_u64(mtime, sbi->segs_per_sec); + + /* Handle if the system time has changed by the user */ + if (mtime < sit_i->min_mtime) + sit_i->min_mtime = mtime; + if (mtime > sit_i->max_mtime) + sit_i->max_mtime = mtime; + if (mtime < sit_i->dirty_min_mtime) + sit_i->dirty_min_mtime = mtime; + if (mtime > sit_i->dirty_max_mtime) + sit_i->dirty_max_mtime = mtime; + + /* don't choose young section as candidate */ + if (sit_i->dirty_max_mtime - mtime < p->age_threshold) + return; + + insert_victim_entry(sbi, mtime, segno); +} + +static struct rb_node *lookup_central_victim(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + struct atgc_management *am = &sbi->am; + struct rb_node *parent = NULL; + bool left_most; + + f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, p->age, &left_most); + + return parent; +} + +static void atgc_lookup_victim(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct atgc_management *am = &sbi->am; + struct rb_root_cached *root = &am->root; + struct rb_node *node; + struct rb_entry *re; + struct victim_entry *ve; + unsigned long long total_time; + unsigned long long age, u, accu; + unsigned long long max_mtime = sit_i->dirty_max_mtime; + unsigned long long min_mtime = sit_i->dirty_min_mtime; + unsigned int sec_blocks = BLKS_PER_SEC(sbi); + unsigned int vblocks; + unsigned int dirty_threshold = max(am->max_candidate_count, + am->candidate_ratio * + am->victim_count / 100); + unsigned int age_weight = am->age_weight; + unsigned int cost; + unsigned int iter = 0; + + if (max_mtime < min_mtime) + return; + + max_mtime += 1; + total_time = max_mtime - min_mtime; + + accu = div64_u64(ULLONG_MAX, total_time); + accu = min_t(unsigned long long, div_u64(accu, 100), + DEFAULT_ACCURACY_CLASS); + + node = rb_first_cached(root); +next: + re = rb_entry_safe(node, struct rb_entry, rb_node); + if (!re) + return; + + ve = (struct victim_entry *)re; + + if (ve->mtime >= max_mtime || ve->mtime < min_mtime) + goto skip; + + /* age = 10000 * x% * 60 */ + age = div64_u64(accu * (max_mtime - ve->mtime), total_time) * + age_weight; + + vblocks = get_valid_blocks(sbi, ve->segno, true); + f2fs_bug_on(sbi, !vblocks || vblocks == sec_blocks); + + /* u = 10000 * x% * 40 */ + u = div64_u64(accu * (sec_blocks - vblocks), sec_blocks) * + (100 - age_weight); + + f2fs_bug_on(sbi, age + u >= UINT_MAX); + + cost = UINT_MAX - (age + u); + iter++; + + if (cost < p->min_cost || + (cost == p->min_cost && age > p->oldest_age)) { + p->min_cost = cost; + p->oldest_age = age; + p->min_segno = ve->segno; + } +skip: + if (iter < dirty_threshold) { + node = rb_next(node); + goto next; + } +} + +/* + * select candidates around source section in range of + * [target - dirty_threshold, target + dirty_threshold] + */ +static void atssr_lookup_victim(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct atgc_management *am = &sbi->am; + struct rb_node *node; + struct rb_entry *re; + struct victim_entry *ve; + unsigned long long age; + unsigned long long max_mtime = sit_i->dirty_max_mtime; + unsigned long long min_mtime = sit_i->dirty_min_mtime; + unsigned int seg_blocks = sbi->blocks_per_seg; + unsigned int vblocks; + unsigned int dirty_threshold = max(am->max_candidate_count, + am->candidate_ratio * + am->victim_count / 100); + unsigned int cost; + unsigned int iter = 0; + int stage = 0; + + if (max_mtime < min_mtime) + return; + max_mtime += 1; +next_stage: + node = lookup_central_victim(sbi, p); +next_node: + re = rb_entry_safe(node, struct rb_entry, rb_node); + if (!re) { + if (stage == 0) + goto skip_stage; + return; + } + + ve = (struct victim_entry *)re; + + if (ve->mtime >= max_mtime || ve->mtime < min_mtime) + goto skip_node; + + age = max_mtime - ve->mtime; + + vblocks = get_seg_entry(sbi, ve->segno)->ckpt_valid_blocks; + f2fs_bug_on(sbi, !vblocks); + + /* rare case */ + if (vblocks == seg_blocks) + goto skip_node; + + iter++; + + age = max_mtime - abs(p->age - age); + cost = UINT_MAX - vblocks; + + if (cost < p->min_cost || + (cost == p->min_cost && age > p->oldest_age)) { + p->min_cost = cost; + p->oldest_age = age; + p->min_segno = ve->segno; + } +skip_node: + if (iter < dirty_threshold) { + if (stage == 0) + node = rb_prev(node); + else if (stage == 1) + node = rb_next(node); + goto next_node; + } +skip_stage: + if (stage < 1) { + stage++; + iter = 0; + goto next_stage; + } +} +static void lookup_victim_by_age(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, + &sbi->am.root, true)); + + if (p->gc_mode == GC_AT) + atgc_lookup_victim(sbi, p); + else if (p->alloc_mode == AT_SSR) + atssr_lookup_victim(sbi, p); + else + f2fs_bug_on(sbi, 1); +} + +static void release_victim_entry(struct f2fs_sb_info *sbi) +{ + struct atgc_management *am = &sbi->am; + struct victim_entry *ve, *tmp; + + list_for_each_entry_safe(ve, tmp, &am->victim_list, list) { + list_del(&ve->list); + kmem_cache_free(victim_entry_slab, ve); + am->victim_count--; + } + + am->root = RB_ROOT_CACHED; + + f2fs_bug_on(sbi, am->victim_count); + f2fs_bug_on(sbi, !list_empty(&am->victim_list)); +} + /* * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. @@ -322,25 +620,37 @@ static unsigned int count_bits(const unsigned long *addr, * which has minimum valid blocks and removes it from dirty seglist. */ static int get_victim_by_default(struct f2fs_sb_info *sbi, - unsigned int *result, int gc_type, int type, char alloc_mode) + unsigned int *result, int gc_type, int type, + char alloc_mode, unsigned long long age) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct sit_info *sm = SIT_I(sbi); struct victim_sel_policy p; unsigned int secno, last_victim; unsigned int last_segment; - unsigned int nsearched = 0; + unsigned int nsearched; + bool is_atgc; int ret = 0; mutex_lock(&dirty_i->seglist_lock); last_segment = MAIN_SECS(sbi) * sbi->segs_per_sec; p.alloc_mode = alloc_mode; - select_policy(sbi, gc_type, type, &p); + p.age = age; + p.age_threshold = sbi->am.age_threshold; +retry: + select_policy(sbi, gc_type, type, &p); p.min_segno = NULL_SEGNO; + p.oldest_age = 0; p.min_cost = get_max_cost(sbi, &p); + is_atgc = (p.gc_mode == GC_AT || p.alloc_mode == AT_SSR); + nsearched = 0; + + if (is_atgc) + SIT_I(sbi)->dirty_min_mtime = ULLONG_MAX; + if (*result != NULL_SEGNO) { if (!get_valid_blocks(sbi, *result, false)) { ret = -ENODATA; @@ -421,11 +731,16 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, /* Don't touch checkpointed data */ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && get_ckpt_valid_blocks(sbi, segno) && - p.alloc_mode != SSR)) + p.alloc_mode == LFS)) goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (is_atgc) { + add_victim_entry(sbi, &p, segno); + goto next; + } + cost = get_gc_cost(sbi, segno, &p); if (p.min_cost > cost) { @@ -444,6 +759,19 @@ next: break; } } + + /* get victim for GC_AT/AT_SSR */ + if (is_atgc) { + lookup_victim_by_age(sbi, &p); + release_victim_entry(sbi); + } + + if (is_atgc && p.min_segno == NULL_SEGNO && + sm->elapsed_time < p.age_threshold) { + p.age_threshold = 0; + goto retry; + } + if (p.min_segno != NULL_SEGNO) { got_it: *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; @@ -536,6 +864,7 @@ static int gc_node_segment(struct f2fs_sb_info *sbi, int phase = 0; bool fggc = (gc_type == FG_GC); int submitted = 0; + unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); start_addr = START_BLOCK(sbi, segno); @@ -545,7 +874,7 @@ next_step: if (fggc && phase == 2) atomic_inc(&sbi->wb_sync_req[NODE]); - for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { + for (off = 0; off < usable_blks_in_seg; off++, entry++) { nid_t nid = le32_to_cpu(entry->nid); struct page *node_page; struct node_info ni; @@ -791,6 +1120,8 @@ static int move_data_block(struct inode *inode, block_t bidx, block_t newaddr; int err = 0; bool lfs_mode = f2fs_lfs_mode(fio.sbi); + int type = fio.sbi->am.atgc_enabled ? + CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA; /* do not read out */ page = f2fs_grab_cache_page(inode->i_mapping, bidx, false); @@ -877,7 +1208,7 @@ static int move_data_block(struct inode *inode, block_t bidx, } f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, - &sum, CURSEG_COLD_DATA, NULL); + &sum, type, NULL); fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); @@ -927,7 +1258,7 @@ put_page_out: recover_block: if (err) f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, - true, true); + true, true, true); up_out: if (lfs_mode) up_write(&fio.sbi->io_order_lock); @@ -1033,13 +1364,14 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, int off; int phase = 0; int submitted = 0; + unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); start_addr = START_BLOCK(sbi, segno); next_step: entry = sum; - for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { + for (off = 0; off < usable_blks_in_seg; off++, entry++) { struct page *data_page; struct inode *inode; struct node_info dni; /* dnode info for the data */ @@ -1182,7 +1514,7 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, down_write(&sit_i->sentry_lock); ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, - NO_CHECK_TYPE, LFS); + NO_CHECK_TYPE, LFS, 0); up_write(&sit_i->sentry_lock); return ret; } @@ -1204,6 +1536,17 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, if (__is_large_section(sbi)) end_segno = rounddown(end_segno, sbi->segs_per_sec); + /* + * zone-capacity can be less than zone-size in zoned devices, + * resulting in less than expected usable segments in the zone, + * calculate the end segno in the zone which can be garbage collected + */ + if (f2fs_sb_has_blkzoned(sbi)) + end_segno -= sbi->segs_per_sec - + f2fs_usable_segs_in_sec(sbi, segno); + + sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type); + /* readahead multi ssa blocks those have contiguous address */ if (__is_large_section(sbi)) f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), @@ -1356,7 +1699,8 @@ gc_more: goto stop; seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); - if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) + if (gc_type == FG_GC && + seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) sec_freed++; total_freed += seg_freed; @@ -1413,6 +1757,37 @@ stop: return ret; } +int __init f2fs_create_garbage_collection_cache(void) +{ + victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry", + sizeof(struct victim_entry)); + if (!victim_entry_slab) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_garbage_collection_cache(void) +{ + kmem_cache_destroy(victim_entry_slab); +} + +static void init_atgc_management(struct f2fs_sb_info *sbi) +{ + struct atgc_management *am = &sbi->am; + + if (test_opt(sbi, ATGC) && + SIT_I(sbi)->elapsed_time >= DEF_GC_THREAD_AGE_THRESHOLD) + am->atgc_enabled = true; + + am->root = RB_ROOT_CACHED; + INIT_LIST_HEAD(&am->victim_list); + am->victim_count = 0; + + am->candidate_ratio = DEF_GC_THREAD_CANDIDATE_RATIO; + am->max_candidate_count = DEF_GC_THREAD_MAX_CANDIDATE_COUNT; + am->age_weight = DEF_GC_THREAD_AGE_WEIGHT; +} + void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) { DIRTY_I(sbi)->v_ops = &default_v_ops; @@ -1423,6 +1798,8 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) if (f2fs_is_multi_device(sbi) && !__is_large_section(sbi)) SIT_I(sbi)->last_victim[ALLOC_NEXT] = GET_SEGNO(sbi, FDEV(0).end_blk) + 1; + + init_atgc_management(sbi); } static int free_segment_range(struct f2fs_sb_info *sbi, @@ -1450,7 +1827,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi, mutex_unlock(&DIRTY_I(sbi)->seglist_lock); /* Move out cursegs from the target range */ - for (type = CURSEG_HOT_DATA; type < NR_CURSEG_TYPE; type++) + for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++) f2fs_allocate_segment_for_resize(sbi, type, start, end); /* do GC to move out valid blocks in the range */ diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index db3c61046aa4..0c8dae12dc51 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -14,6 +14,14 @@ #define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ #define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 #define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ + +/* choose candidates from sections which has age of more than 7 days */ +#define DEF_GC_THREAD_AGE_THRESHOLD (60 * 60 * 24 * 7) +#define DEF_GC_THREAD_CANDIDATE_RATIO 20 /* select 20% oldest sections as candidates */ +#define DEF_GC_THREAD_MAX_CANDIDATE_COUNT 10 /* select at most 10 sections as candidates */ +#define DEF_GC_THREAD_AGE_WEIGHT 60 /* age weight */ +#define DEFAULT_ACCURACY_CLASS 10000 /* accuracy class */ + #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ @@ -41,16 +49,69 @@ struct gc_inode_list { struct radix_tree_root iroot; }; +struct victim_info { + unsigned long long mtime; /* mtime of section */ + unsigned int segno; /* section No. */ +}; + +struct victim_entry { + struct rb_node rb_node; /* rb node located in rb-tree */ + union { + struct { + unsigned long long mtime; /* mtime of section */ + unsigned int segno; /* segment No. */ + }; + struct victim_info vi; /* victim info */ + }; + struct list_head list; +}; + /* * inline functions */ + +/* + * On a Zoned device zone-capacity can be less than zone-size and if + * zone-capacity is not aligned to f2fs segment size(2MB), then the segment + * starting just before zone-capacity has some blocks spanning across the + * zone-capacity, these blocks are not usable. + * Such spanning segments can be in free list so calculate the sum of usable + * blocks in currently free segments including normal and spanning segments. + */ +static inline block_t free_segs_blk_count_zoned(struct f2fs_sb_info *sbi) +{ + block_t free_seg_blks = 0; + struct free_segmap_info *free_i = FREE_I(sbi); + int j; + + spin_lock(&free_i->segmap_lock); + for (j = 0; j < MAIN_SEGS(sbi); j++) + if (!test_bit(j, free_i->free_segmap)) + free_seg_blks += f2fs_usable_blks_in_seg(sbi, j); + spin_unlock(&free_i->segmap_lock); + + return free_seg_blks; +} + +static inline block_t free_segs_blk_count(struct f2fs_sb_info *sbi) +{ + if (f2fs_sb_has_blkzoned(sbi)) + return free_segs_blk_count_zoned(sbi); + + return free_segments(sbi) << sbi->log_blocks_per_seg; +} + static inline block_t free_user_blocks(struct f2fs_sb_info *sbi) { - if (free_segments(sbi) < overprovision_segments(sbi)) + block_t free_blks, ovp_blks; + + free_blks = free_segs_blk_count(sbi); + ovp_blks = overprovision_segments(sbi) << sbi->log_blocks_per_seg; + + if (free_blks < ovp_blks) return 0; - else - return (free_segments(sbi) - overprovision_segments(sbi)) - << sbi->log_blocks_per_seg; + + return free_blks - ovp_blks; } static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 102df444f623..70384e31788d 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -524,7 +524,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, !f2fs_has_inline_xattr(dir)) F2FS_I(dir)->i_inline_xattr_size = 0; - kvfree(backup_dentry); + kfree(backup_dentry); return 0; recover: lock_page(ipage); @@ -535,7 +535,7 @@ recover: set_page_dirty(ipage); f2fs_put_page(ipage, 1); - kvfree(backup_dentry); + kfree(backup_dentry); return err; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 66969ae852b9..657db2fb6739 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -287,11 +287,19 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if ((fi->i_flags & F2FS_CASEFOLD_FL) && !f2fs_sb_has_casefold(sbi)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) has casefold flag, but casefold feature is off", + __func__, inode->i_ino); + return false; + } + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) && fi->i_flags & F2FS_COMPR_FL && F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_log_cluster_size)) { if (ri->i_compress_algorithm >= COMPRESS_MAX) { + set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported " "compress algorithm: %u, run fsck to fix", __func__, inode->i_ino, @@ -300,6 +308,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } if (le64_to_cpu(ri->i_compr_blocks) > SECTOR_TO_BLOCK(inode->i_blocks)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) has inconsistent " "i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix", __func__, inode->i_ino, @@ -309,6 +318,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) { + set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported " "log cluster size: %u, run fsck to fix", __func__, inode->i_ino, @@ -442,7 +452,8 @@ static int do_read_inode(struct inode *inode) (fi->i_flags & F2FS_COMPR_FL)) { if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_log_cluster_size)) { - fi->i_compr_blocks = le64_to_cpu(ri->i_compr_blocks); + atomic_set(&fi->i_compr_blocks, + le64_to_cpu(ri->i_compr_blocks)); fi->i_compress_algorithm = ri->i_compress_algorithm; fi->i_log_cluster_size = ri->i_log_cluster_size; fi->i_cluster_size = 1 << fi->i_log_cluster_size; @@ -460,7 +471,7 @@ static int do_read_inode(struct inode *inode) stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); stat_inc_compr_inode(inode); - stat_add_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks); + stat_add_compr_blocks(inode, atomic_read(&fi->i_compr_blocks)); return 0; } @@ -619,7 +630,8 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, i_log_cluster_size)) { ri->i_compr_blocks = - cpu_to_le64(F2FS_I(inode)->i_compr_blocks); + cpu_to_le64(atomic_read( + &F2FS_I(inode)->i_compr_blocks)); ri->i_compress_algorithm = F2FS_I(inode)->i_compress_algorithm; ri->i_log_cluster_size = @@ -768,7 +780,8 @@ no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); stat_dec_compr_inode(inode); - stat_sub_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks); + stat_sub_compr_blocks(inode, + atomic_read(&F2FS_I(inode)->i_compr_blocks)); if (likely(!f2fs_cp_error(sbi) && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 45f324511a19..8fa37d1434de 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -712,7 +712,7 @@ out_f2fs_handle_failed_inode: f2fs_handle_failed_inode(inode); out_free_encrypted_link: if (disk_link.name != (unsigned char *)symname) - kvfree(disk_link.name); + kfree(disk_link.name); return err; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index cb1b5b61a1da..d5d8ce077f29 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -109,7 +109,7 @@ static void clear_node_page_dirty(struct page *page) static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) { - return f2fs_get_meta_page_nofail(sbi, current_nat_addr(sbi, nid)); + return f2fs_get_meta_page(sbi, current_nat_addr(sbi, nid)); } static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) @@ -3105,9 +3105,6 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP); - if (!version_bitmap) - return -EFAULT; - nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size, GFP_KERNEL); if (!nm_i->nat_bitmap) @@ -3257,7 +3254,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) kvfree(nm_i->nat_bitmap_mir); #endif sbi->nm_info = NULL; - kvfree(nm_i); + kfree(nm_i); } int __init f2fs_create_node_manager_caches(void) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e247a5ef3713..1596502f7375 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -189,7 +189,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) f2fs_trace_pid(page); - f2fs_set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); + f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); @@ -728,7 +728,7 @@ init_thread: "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); - kvfree(fcc); + kfree(fcc); SM_I(sbi)->fcc_info = NULL; return err; } @@ -747,7 +747,7 @@ void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) kthread_stop(flush_thread); } if (free) { - kvfree(fcc); + kfree(fcc); SM_I(sbi)->fcc_info = NULL; } } @@ -759,6 +759,9 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) if (!f2fs_is_multi_device(sbi)) return 0; + if (test_opt(sbi, NOBARRIER)) + return 0; + for (i = 1; i < sbi->s_ndevs; i++) { if (!f2fs_test_bit(i, (char *)&sbi->dirty_device)) continue; @@ -859,20 +862,22 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned short valid_blocks, ckpt_valid_blocks; + unsigned int usable_blocks; if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno)) return; + usable_blocks = f2fs_usable_blks_in_seg(sbi, segno); mutex_lock(&dirty_i->seglist_lock); valid_blocks = get_valid_blocks(sbi, segno, false); ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno); if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) || - ckpt_valid_blocks == sbi->blocks_per_seg)) { + ckpt_valid_blocks == usable_blocks)) { __locate_dirty_segment(sbi, segno, PRE); __remove_dirty_segment(sbi, segno, DIRTY); - } else if (valid_blocks < sbi->blocks_per_seg) { + } else if (valid_blocks < usable_blocks) { __locate_dirty_segment(sbi, segno, DIRTY); } else { /* Recovery routine with SSR needs this */ @@ -915,9 +920,11 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi) for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { se = get_seg_entry(sbi, segno); if (IS_NODESEG(se->type)) - holes[NODE] += sbi->blocks_per_seg - se->valid_blocks; + holes[NODE] += f2fs_usable_blks_in_seg(sbi, segno) - + se->valid_blocks; else - holes[DATA] += sbi->blocks_per_seg - se->valid_blocks; + holes[DATA] += f2fs_usable_blks_in_seg(sbi, segno) - + se->valid_blocks; } mutex_unlock(&dirty_i->seglist_lock); @@ -1521,7 +1528,7 @@ retry: goto next; if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &dcc->root)); + &dcc->root, false)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); @@ -1958,7 +1965,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi)) - __set_test_and_free(sbi, segno); + __set_test_and_free(sbi, segno, false); mutex_unlock(&dirty_i->seglist_lock); } @@ -2101,7 +2108,7 @@ init_thread: "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(dcc->f2fs_issue_discard)) { err = PTR_ERR(dcc->f2fs_issue_discard); - kvfree(dcc); + kfree(dcc); SM_I(sbi)->dcc_info = NULL; return err; } @@ -2125,7 +2132,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) if (unlikely(atomic_read(&dcc->discard_cmd_cnt))) f2fs_issue_discard_timeout(sbi); - kvfree(dcc); + kfree(dcc); SM_I(sbi)->dcc_info = NULL; } @@ -2150,6 +2157,39 @@ static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, __mark_sit_entry_dirty(sbi, segno); } +static inline unsigned long long get_segment_mtime(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + unsigned int segno = GET_SEGNO(sbi, blkaddr); + + if (segno == NULL_SEGNO) + return 0; + return get_seg_entry(sbi, segno)->mtime; +} + +static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr, + unsigned long long old_mtime) +{ + struct seg_entry *se; + unsigned int segno = GET_SEGNO(sbi, blkaddr); + unsigned long long ctime = get_mtime(sbi, false); + unsigned long long mtime = old_mtime ? old_mtime : ctime; + + if (segno == NULL_SEGNO) + return; + + se = get_seg_entry(sbi, segno); + + if (!se->mtime) + se->mtime = mtime; + else + se->mtime = div_u64(se->mtime * se->valid_blocks + mtime, + se->valid_blocks + 1); + + if (ctime > SIT_I(sbi)->max_mtime) + SIT_I(sbi)->max_mtime = ctime; +} + static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) { struct seg_entry *se; @@ -2167,12 +2207,9 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); f2fs_bug_on(sbi, (new_vblocks < 0 || - (new_vblocks > sbi->blocks_per_seg))); + (new_vblocks > f2fs_usable_blks_in_seg(sbi, segno)))); se->valid_blocks = new_vblocks; - se->mtime = get_mtime(sbi, false); - if (se->mtime > SIT_I(sbi)->max_mtime) - SIT_I(sbi)->max_mtime = se->mtime; /* Update valid block bitmap */ if (del > 0) { @@ -2265,6 +2302,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) /* add it into sit main buffer */ down_write(&sit_i->sentry_lock); + update_segment_mtime(sbi, addr, 0); update_sit_entry(sbi, addr, -1); /* add it into dirty seglist */ @@ -2344,7 +2382,9 @@ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) */ struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return f2fs_get_meta_page_nofail(sbi, GET_SUM_BLOCK(sbi, segno)); + if (unlikely(f2fs_cp_error(sbi))) + return ERR_PTR(-EIO); + return f2fs_get_meta_page_retry(sbi, GET_SUM_BLOCK(sbi, segno)); } void f2fs_update_meta_page(struct f2fs_sb_info *sbi, @@ -2389,9 +2429,9 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } -static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) +static int is_next_segment_free(struct f2fs_sb_info *sbi, + struct curseg_info *curseg, int type) { - struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int segno = curseg->segno + 1; struct free_segmap_info *free_i = FREE_I(sbi); @@ -2495,7 +2535,9 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) { struct curseg_info *curseg = CURSEG_I(sbi, type); struct summary_footer *sum_footer; + unsigned short seg_type = curseg->seg_type; + curseg->inited = true; curseg->segno = curseg->next_segno; curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; @@ -2503,24 +2545,36 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) sum_footer = &(curseg->sum_blk->footer); memset(sum_footer, 0, sizeof(struct summary_footer)); - if (IS_DATASEG(type)) + + sanity_check_seg_type(sbi, seg_type); + + if (IS_DATASEG(seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA); - if (IS_NODESEG(type)) + if (IS_NODESEG(seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE); - __set_sit_entry_type(sbi, type, curseg->segno, modified); + __set_sit_entry_type(sbi, seg_type, curseg->segno, modified); } static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) { + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned short seg_type = curseg->seg_type; + + sanity_check_seg_type(sbi, seg_type); + /* if segs_per_sec is large than 1, we need to keep original policy. */ if (__is_large_section(sbi)) - return CURSEG_I(sbi, type)->segno; + return curseg->segno; + + /* inmem log may not locate on any segment after mount */ + if (!curseg->inited) + return 0; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return 0; if (test_opt(sbi, NOHEAP) && - (type == CURSEG_HOT_DATA || IS_NODESEG(type))) + (seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type))) return 0; if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) @@ -2530,7 +2584,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) return 0; - return CURSEG_I(sbi, type)->segno; + return curseg->segno; } /* @@ -2540,12 +2594,14 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) { struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned short seg_type = curseg->seg_type; unsigned int segno = curseg->segno; int dir = ALLOC_LEFT; - write_sum_page(sbi, curseg->sum_blk, + if (curseg->inited) + write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, segno)); - if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA) + if (seg_type == CURSEG_WARM_DATA || seg_type == CURSEG_COLD_DATA) dir = ALLOC_RIGHT; if (test_opt(sbi, NOHEAP)) @@ -2594,7 +2650,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type) +static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2602,8 +2658,10 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type) struct f2fs_summary_block *sum_node; struct page *sum_page; - write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, curseg->segno)); + if (flush) + write_sum_page(sbi, curseg->sum_blk, + GET_SUM_BLOCK(sbi, curseg->segno)); + __set_test_and_inuse(sbi, new_segno); mutex_lock(&dirty_i->seglist_lock); @@ -2616,29 +2674,139 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type) __next_free_blkoff(sbi, curseg, 0); sum_page = f2fs_get_sum_page(sbi, new_segno); - f2fs_bug_on(sbi, IS_ERR(sum_page)); + if (IS_ERR(sum_page)) { + /* GC won't be able to use stale summary pages by cp_error */ + memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE); + return; + } sum_node = (struct f2fs_summary_block *)page_address(sum_page); memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); f2fs_put_page(sum_page, 1); } -static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) +static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, + int alloc_mode, unsigned long long age); + +static void get_atssr_segment(struct f2fs_sb_info *sbi, int type, + int target_type, int alloc_mode, + unsigned long long age) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + curseg->seg_type = target_type; + + if (get_ssr_segment(sbi, type, alloc_mode, age)) { + struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno); + + curseg->seg_type = se->type; + change_curseg(sbi, type, true); + } else { + /* allocate cold segment by default */ + curseg->seg_type = CURSEG_COLD_DATA; + new_curseg(sbi, type, true); + } + stat_inc_seg_type(sbi, curseg); +} + +static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC); + + if (!sbi->am.atgc_enabled) + return; + + down_read(&SM_I(sbi)->curseg_lock); + + mutex_lock(&curseg->curseg_mutex); + down_write(&SIT_I(sbi)->sentry_lock); + + get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC, CURSEG_COLD_DATA, SSR, 0); + + up_write(&SIT_I(sbi)->sentry_lock); + mutex_unlock(&curseg->curseg_mutex); + + up_read(&SM_I(sbi)->curseg_lock); + +} +void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi) +{ + __f2fs_init_atgc_curseg(sbi); +} + +static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + mutex_lock(&curseg->curseg_mutex); + if (!curseg->inited) + goto out; + + if (get_valid_blocks(sbi, curseg->segno, false)) { + write_sum_page(sbi, curseg->sum_blk, + GET_SUM_BLOCK(sbi, curseg->segno)); + } else { + mutex_lock(&DIRTY_I(sbi)->seglist_lock); + __set_test_and_free(sbi, curseg->segno, true); + mutex_unlock(&DIRTY_I(sbi)->seglist_lock); + } +out: + mutex_unlock(&curseg->curseg_mutex); +} + +void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi) +{ + __f2fs_save_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); + + if (sbi->am.atgc_enabled) + __f2fs_save_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); +} + +static void __f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + mutex_lock(&curseg->curseg_mutex); + if (!curseg->inited) + goto out; + if (get_valid_blocks(sbi, curseg->segno, false)) + goto out; + + mutex_lock(&DIRTY_I(sbi)->seglist_lock); + __set_test_and_inuse(sbi, curseg->segno); + mutex_unlock(&DIRTY_I(sbi)->seglist_lock); +out: + mutex_unlock(&curseg->curseg_mutex); +} + +void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi) +{ + __f2fs_restore_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); + + if (sbi->am.atgc_enabled) + __f2fs_restore_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); +} + +static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, + int alloc_mode, unsigned long long age) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; unsigned segno = NULL_SEGNO; + unsigned short seg_type = curseg->seg_type; int i, cnt; bool reversed = false; + sanity_check_seg_type(sbi, seg_type); + /* f2fs_need_SSR() already forces to do this */ - if (!v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { + if (!v_ops->get_victim(sbi, &segno, BG_GC, seg_type, alloc_mode, age)) { curseg->next_segno = segno; return 1; } /* For node segments, let's do SSR more intensively */ - if (IS_NODESEG(type)) { - if (type >= CURSEG_WARM_NODE) { + if (IS_NODESEG(seg_type)) { + if (seg_type >= CURSEG_WARM_NODE) { reversed = true; i = CURSEG_COLD_NODE; } else { @@ -2646,7 +2814,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) } cnt = NR_CURSEG_NODE_TYPE; } else { - if (type >= CURSEG_WARM_DATA) { + if (seg_type >= CURSEG_WARM_DATA) { reversed = true; i = CURSEG_COLD_DATA; } else { @@ -2656,9 +2824,9 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) } for (; cnt-- > 0; reversed ? i-- : i++) { - if (i == type) + if (i == seg_type) continue; - if (!v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) { + if (!v_ops->get_victim(sbi, &segno, BG_GC, i, alloc_mode, age)) { curseg->next_segno = segno; return 1; } @@ -2687,13 +2855,15 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, if (force) new_curseg(sbi, type, true); else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && - type == CURSEG_WARM_NODE) + curseg->seg_type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); - else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type) && + else if (curseg->alloc_type == LFS && + is_next_segment_free(sbi, curseg, type) && likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) new_curseg(sbi, type, false); - else if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type); + else if (f2fs_need_SSR(sbi) && + get_ssr_segment(sbi, type, SSR, 0)) + change_curseg(sbi, type, true); else new_curseg(sbi, type, false); @@ -2714,8 +2884,8 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, if (segno < start || segno > end) goto unlock; - if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type); + if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) + change_curseg(sbi, type, true); else new_curseg(sbi, type, true); @@ -2738,11 +2908,15 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type) struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; + if (!curseg->inited) + goto alloc; + if (!curseg->next_blkoff && !get_valid_blocks(sbi, curseg->segno, false) && !get_ckpt_valid_blocks(sbi, curseg->segno)) return; +alloc: old_segno = curseg->segno; SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); locate_dirty_segment(sbi, old_segno); @@ -2806,7 +2980,7 @@ next: mutex_lock(&dcc->cmd_lock); if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &dcc->root)); + &dcc->root, false)); dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, NULL, start, @@ -2930,12 +3104,11 @@ out: return err; } -static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) +static bool __has_curseg_space(struct f2fs_sb_info *sbi, + struct curseg_info *curseg) { - struct curseg_info *curseg = CURSEG_I(sbi, type); - if (curseg->next_blkoff < sbi->blocks_per_seg) - return true; - return false; + return curseg->next_blkoff < f2fs_usable_blks_in_seg(sbi, + curseg->segno); } int f2fs_rw_hint_to_seg_type(enum rw_hint hint) @@ -3075,8 +3248,13 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; - if (is_cold_data(fio->page) || file_is_cold(inode) || - f2fs_compressed_file(inode)) + if (is_cold_data(fio->page)) { + if (fio->sbi->am.atgc_enabled) + return CURSEG_ALL_DATA_ATGC; + else + return CURSEG_COLD_DATA; + } + if (file_is_cold(inode) || f2fs_compressed_file(inode)) return CURSEG_COLD_DATA; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || @@ -3126,27 +3304,25 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); - bool put_pin_sem = false; - - if (type == CURSEG_COLD_DATA) { - /* GC during CURSEG_COLD_DATA_PINNED allocation */ - if (down_read_trylock(&sbi->pin_sem)) { - put_pin_sem = true; - } else { - type = CURSEG_WARM_DATA; - curseg = CURSEG_I(sbi, type); - } - } else if (type == CURSEG_COLD_DATA_PINNED) { - type = CURSEG_COLD_DATA; - } + unsigned long long old_mtime; + bool from_gc = (type == CURSEG_ALL_DATA_ATGC); + struct seg_entry *se = NULL; down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); + if (from_gc) { + f2fs_bug_on(sbi, GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO); + se = get_seg_entry(sbi, GET_SEGNO(sbi, old_blkaddr)); + sanity_check_seg_type(sbi, se->type); + f2fs_bug_on(sbi, IS_NODESEG(se->type)); + } *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + f2fs_bug_on(sbi, curseg->next_blkoff >= sbi->blocks_per_seg); + f2fs_wait_discard_bio(sbi, *new_blkaddr); /* @@ -3160,6 +3336,14 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); + if (from_gc) { + old_mtime = get_segment_mtime(sbi, old_blkaddr); + } else { + update_segment_mtime(sbi, old_blkaddr, 0); + old_mtime = 0; + } + update_segment_mtime(sbi, *new_blkaddr, old_mtime); + /* * SIT information should be updated before segment allocation, * since SSR needs latest valid block information. @@ -3168,9 +3352,13 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) update_sit_entry(sbi, old_blkaddr, -1); - if (!__has_curseg_space(sbi, type)) - sit_i->s_ops->allocate_segment(sbi, type, false); - + if (!__has_curseg_space(sbi, curseg)) { + if (from_gc) + get_atssr_segment(sbi, type, se->type, + AT_SSR, se->mtime); + else + sit_i->s_ops->allocate_segment(sbi, type, false); + } /* * segment dirty status should be updated after segment allocation, * so we just need to update status only one time after previous @@ -3204,9 +3392,6 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&curseg->curseg_mutex); up_read(&SM_I(sbi)->curseg_lock); - - if (put_pin_sem) - up_read(&sbi->pin_sem); } static void update_device_state(struct f2fs_io_info *fio) @@ -3355,7 +3540,8 @@ static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, - bool recover_curseg, bool recover_newaddr) + bool recover_curseg, bool recover_newaddr, + bool from_gc) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; @@ -3400,17 +3586,22 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type); + change_curseg(sbi, type, true); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); - if (!recover_curseg || recover_newaddr) + if (!recover_curseg || recover_newaddr) { + if (!from_gc) + update_segment_mtime(sbi, new_blkaddr, 0); update_sit_entry(sbi, new_blkaddr, 1); + } if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { invalidate_mapping_pages(META_MAPPING(sbi), old_blkaddr, old_blkaddr); + if (!from_gc) + update_segment_mtime(sbi, old_blkaddr, 0); update_sit_entry(sbi, old_blkaddr, -1); } @@ -3422,7 +3613,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type); + change_curseg(sbi, type, true); } curseg->next_blkoff = old_blkoff; } @@ -3442,7 +3633,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, set_summary(&sum, dn->nid, dn->ofs_in_node, version); f2fs_do_replace_block(sbi, &sum, old_addr, new_addr, - recover_curseg, recover_newaddr); + recover_curseg, recover_newaddr, false); f2fs_update_data_blkaddr(dn, new_addr); } @@ -3574,7 +3765,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - CURSEG_HOT_DATA]); if (__exist_node_summaries(sbi)) - blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type); + blk_addr = sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type); else blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); } else { @@ -3652,8 +3843,9 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) } if (__exist_node_summaries(sbi)) - f2fs_ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type), - NR_CURSEG_TYPE - type, META_CP, true); + f2fs_ra_meta_pages(sbi, + sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type), + NR_CURSEG_PERSIST_TYPE - type, META_CP, true); for (; type <= CURSEG_COLD_NODE; type++) { err = read_normal_summaries(sbi, type); @@ -3781,7 +3973,7 @@ int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return f2fs_get_meta_page_nofail(sbi, current_sit_addr(sbi, segno)); + return f2fs_get_meta_page(sbi, current_sit_addr(sbi, segno)); } static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, @@ -4155,14 +4347,14 @@ static int build_curseg(struct f2fs_sb_info *sbi) struct curseg_info *array; int i; - array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, sizeof(*array)), - GFP_KERNEL); + array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, + sizeof(*array)), GFP_KERNEL); if (!array) return -ENOMEM; SM_I(sbi)->curseg_array = array; - for (i = 0; i < NR_CURSEG_TYPE; i++) { + for (i = 0; i < NO_CHECK_TYPE; i++) { mutex_init(&array[i].curseg_mutex); array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL); if (!array[i].sum_blk) @@ -4172,8 +4364,15 @@ static int build_curseg(struct f2fs_sb_info *sbi) sizeof(struct f2fs_journal), GFP_KERNEL); if (!array[i].journal) return -ENOMEM; + if (i < NR_PERSISTENT_LOG) + array[i].seg_type = CURSEG_HOT_DATA + i; + else if (i == CURSEG_COLD_DATA_PINNED) + array[i].seg_type = CURSEG_COLD_DATA; + else if (i == CURSEG_ALL_DATA_ATGC) + array[i].seg_type = CURSEG_COLD_DATA; array[i].segno = NULL_SEGNO; array[i].next_blkoff = 0; + array[i].inited = false; } return restore_curseg_summaries(sbi); } @@ -4294,9 +4493,12 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) { unsigned int start; int type; + struct seg_entry *sentry; for (start = 0; start < MAIN_SEGS(sbi); start++) { - struct seg_entry *sentry = get_seg_entry(sbi, start); + if (f2fs_usable_blks_in_seg(sbi, start) == 0) + continue; + sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); else @@ -4316,7 +4518,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno = 0, offset = 0, secno; - block_t valid_blocks; + block_t valid_blocks, usable_blks_in_seg; block_t blks_per_sec = BLKS_PER_SEC(sbi); while (1) { @@ -4326,9 +4528,10 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) break; offset = segno + 1; valid_blocks = get_valid_blocks(sbi, segno, false); - if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) + usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); + if (valid_blocks == usable_blks_in_seg || !valid_blocks) continue; - if (valid_blocks > sbi->blocks_per_seg) { + if (valid_blocks > usable_blks_in_seg) { f2fs_bug_on(sbi, 1); continue; } @@ -4408,11 +4611,13 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) * In LFS/SSR curseg, .next_blkoff should point to an unused blkaddr; * In LFS curseg, all blkaddr after .next_blkoff should be unused. */ - for (i = 0; i < NO_CHECK_TYPE; i++) { + for (i = 0; i < NR_PERSISTENT_LOG; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); struct seg_entry *se = get_seg_entry(sbi, curseg->segno); unsigned int blkofs = curseg->next_blkoff; + sanity_check_seg_type(sbi, curseg->seg_type); + if (f2fs_test_bit(blkofs, se->cur_valid_map)) goto out; @@ -4637,7 +4842,7 @@ int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) { int i, ret; - for (i = 0; i < NO_CHECK_TYPE; i++) { + for (i = 0; i < NR_PERSISTENT_LOG; i++) { ret = fix_curseg_write_pointer(sbi, i); if (ret) return ret; @@ -4678,6 +4883,101 @@ int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) return 0; } + +static bool is_conv_zone(struct f2fs_sb_info *sbi, unsigned int zone_idx, + unsigned int dev_idx) +{ + if (!bdev_is_zoned(FDEV(dev_idx).bdev)) + return true; + return !test_bit(zone_idx, FDEV(dev_idx).blkz_seq); +} + +/* Return the zone index in the given device */ +static unsigned int get_zone_idx(struct f2fs_sb_info *sbi, unsigned int secno, + int dev_idx) +{ + block_t sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno)); + + return (sec_start_blkaddr - FDEV(dev_idx).start_blk) >> + sbi->log_blocks_per_blkz; +} + +/* + * Return the usable segments in a section based on the zone's + * corresponding zone capacity. Zone is equal to a section. + */ +static inline unsigned int f2fs_usable_zone_segs_in_sec( + struct f2fs_sb_info *sbi, unsigned int segno) +{ + unsigned int dev_idx, zone_idx, unusable_segs_in_sec; + + dev_idx = f2fs_target_device_index(sbi, START_BLOCK(sbi, segno)); + zone_idx = get_zone_idx(sbi, GET_SEC_FROM_SEG(sbi, segno), dev_idx); + + /* Conventional zone's capacity is always equal to zone size */ + if (is_conv_zone(sbi, zone_idx, dev_idx)) + return sbi->segs_per_sec; + + /* + * If the zone_capacity_blocks array is NULL, then zone capacity + * is equal to the zone size for all zones + */ + if (!FDEV(dev_idx).zone_capacity_blocks) + return sbi->segs_per_sec; + + /* Get the segment count beyond zone capacity block */ + unusable_segs_in_sec = (sbi->blocks_per_blkz - + FDEV(dev_idx).zone_capacity_blocks[zone_idx]) >> + sbi->log_blocks_per_seg; + return sbi->segs_per_sec - unusable_segs_in_sec; +} + +/* + * Return the number of usable blocks in a segment. The number of blocks + * returned is always equal to the number of blocks in a segment for + * segments fully contained within a sequential zone capacity or a + * conventional zone. For segments partially contained in a sequential + * zone capacity, the number of usable blocks up to the zone capacity + * is returned. 0 is returned in all other cases. + */ +static inline unsigned int f2fs_usable_zone_blks_in_seg( + struct f2fs_sb_info *sbi, unsigned int segno) +{ + block_t seg_start, sec_start_blkaddr, sec_cap_blkaddr; + unsigned int zone_idx, dev_idx, secno; + + secno = GET_SEC_FROM_SEG(sbi, segno); + seg_start = START_BLOCK(sbi, segno); + dev_idx = f2fs_target_device_index(sbi, seg_start); + zone_idx = get_zone_idx(sbi, secno, dev_idx); + + /* + * Conventional zone's capacity is always equal to zone size, + * so, blocks per segment is unchanged. + */ + if (is_conv_zone(sbi, zone_idx, dev_idx)) + return sbi->blocks_per_seg; + + if (!FDEV(dev_idx).zone_capacity_blocks) + return sbi->blocks_per_seg; + + sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno)); + sec_cap_blkaddr = sec_start_blkaddr + + FDEV(dev_idx).zone_capacity_blocks[zone_idx]; + + /* + * If segment starts before zone capacity and spans beyond + * zone capacity, then usable blocks are from seg start to + * zone capacity. If the segment starts after the zone capacity, + * then there are no usable blocks. + */ + if (seg_start >= sec_cap_blkaddr) + return 0; + if (seg_start + sbi->blocks_per_seg > sec_cap_blkaddr) + return sec_cap_blkaddr - seg_start; + + return sbi->blocks_per_seg; +} #else int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) { @@ -4688,7 +4988,36 @@ int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) { return 0; } + +static inline unsigned int f2fs_usable_zone_blks_in_seg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + return 0; +} + +static inline unsigned int f2fs_usable_zone_segs_in_sec(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + return 0; +} #endif +unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + if (f2fs_sb_has_blkzoned(sbi)) + return f2fs_usable_zone_blks_in_seg(sbi, segno); + + return sbi->blocks_per_seg; +} + +unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + if (f2fs_sb_has_blkzoned(sbi)) + return f2fs_usable_zone_segs_in_sec(sbi, segno); + + return sbi->segs_per_sec; +} /* * Update min, max modified time for cost-benefit GC algorithm @@ -4715,6 +5044,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) sit_i->min_mtime = mtime; } sit_i->max_mtime = get_mtime(sbi, false); + sit_i->dirty_max_mtime = 0; up_write(&sit_i->sentry_lock); } @@ -4830,7 +5160,7 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) destroy_victim_secmap(sbi); SM_I(sbi)->dirty_info = NULL; - kvfree(dirty_i); + kfree(dirty_i); } static void destroy_curseg(struct f2fs_sb_info *sbi) @@ -4842,10 +5172,10 @@ static void destroy_curseg(struct f2fs_sb_info *sbi) return; SM_I(sbi)->curseg_array = NULL; for (i = 0; i < NR_CURSEG_TYPE; i++) { - kvfree(array[i].sum_blk); - kvfree(array[i].journal); + kfree(array[i].sum_blk); + kfree(array[i].journal); } - kvfree(array); + kfree(array); } static void destroy_free_segmap(struct f2fs_sb_info *sbi) @@ -4856,7 +5186,7 @@ static void destroy_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = NULL; kvfree(free_i->free_segmap); kvfree(free_i->free_secmap); - kvfree(free_i); + kfree(free_i); } static void destroy_sit_info(struct f2fs_sb_info *sbi) @@ -4868,7 +5198,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) if (sit_i->sentries) kvfree(sit_i->bitmap); - kvfree(sit_i->tmp_map); + kfree(sit_i->tmp_map); kvfree(sit_i->sentries); kvfree(sit_i->sec_entries); @@ -4880,7 +5210,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) kvfree(sit_i->sit_bitmap_mir); kvfree(sit_i->invalid_segmap); #endif - kvfree(sit_i); + kfree(sit_i); } void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) @@ -4896,7 +5226,7 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) destroy_free_segmap(sbi); destroy_sit_info(sbi); sbi->sm_info = NULL; - kvfree(sm_info); + kfree(sm_info); } int __init f2fs_create_segment_manager_caches(void) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 752b177073b2..e81eb0748e2a 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -16,13 +16,20 @@ #define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ +#define F2FS_MIN_META_SEGMENTS 8 /* SB + 2 (CP + SIT + NAT) + SSA */ /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) #define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) #define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) -#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) +#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE) + +static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, + unsigned short seg_type) +{ + f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG); +} #define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) #define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) @@ -34,7 +41,9 @@ ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) + ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno)) #define IS_CURSEC(sbi, secno) \ (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ @@ -48,7 +57,11 @@ ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ (sbi)->segs_per_sec) || \ ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - (sbi)->segs_per_sec)) \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno / \ + (sbi)->segs_per_sec)) #define MAIN_BLKADDR(sbi) \ (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \ @@ -132,20 +145,25 @@ enum { * In the victim_sel_policy->alloc_mode, there are two block allocation modes. * LFS writes data sequentially with cleaning operations. * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations. + * AT_SSR (Age Threshold based Slack Space Recycle) merges fragments into + * fragmented segment which has similar aging degree. */ enum { LFS = 0, - SSR + SSR, + AT_SSR, }; /* * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes. * GC_CB is based on cost-benefit algorithm. * GC_GREEDY is based on greedy algorithm. + * GC_AT is based on age-threshold algorithm. */ enum { GC_CB = 0, GC_GREEDY, + GC_AT, ALLOC_NEXT, FLUSH_DEVICE, MAX_GC_POLICY, @@ -174,7 +192,10 @@ struct victim_sel_policy { unsigned int offset; /* last scanned bitmap offset */ unsigned int ofs_unit; /* bitmap search unit */ unsigned int min_cost; /* minimum cost */ + unsigned long long oldest_age; /* oldest age of segments having the same min cost */ unsigned int min_segno; /* segment # having min. cost */ + unsigned long long age; /* mtime of GCed section*/ + unsigned long long age_threshold;/* age threshold */ }; struct seg_entry { @@ -240,6 +261,8 @@ struct sit_info { unsigned long long mounted_time; /* mount time */ unsigned long long min_mtime; /* min. modification time */ unsigned long long max_mtime; /* max. modification time */ + unsigned long long dirty_min_mtime; /* rerange candidates in GC_AT */ + unsigned long long dirty_max_mtime; /* rerange candidates in GC_AT */ unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ }; @@ -278,7 +301,7 @@ struct dirty_seglist_info { /* victim selection function for cleaning and SSR */ struct victim_selection { int (*get_victim)(struct f2fs_sb_info *, unsigned int *, - int, int, char); + int, int, char, unsigned long long); }; /* for active log information */ @@ -288,10 +311,12 @@ struct curseg_info { struct rw_semaphore journal_rwsem; /* protect journal area */ struct f2fs_journal *journal; /* cached journal info */ unsigned char alloc_type; /* current allocation type */ + unsigned short seg_type; /* segment type like CURSEG_XXX_TYPE */ unsigned int segno; /* current segment number */ unsigned short next_blkoff; /* next block offset to write */ unsigned int zone; /* current zone number */ unsigned int next_segno; /* preallocated segment */ + bool inited; /* indicate inmem log is inited */ }; struct sit_entry_set { @@ -305,8 +330,6 @@ struct sit_entry_set { */ static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type) { - if (type == CURSEG_COLD_DATA_PINNED) - type = CURSEG_COLD_DATA; return (struct curseg_info *)(SM_I(sbi)->curseg_array + type); } @@ -411,6 +434,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; + unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno); spin_lock(&free_i->segmap_lock); clear_bit(segno, free_i->free_segmap); @@ -418,7 +442,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) next = find_next_bit(free_i->free_segmap, start_segno + sbi->segs_per_sec, start_segno); - if (next >= start_segno + sbi->segs_per_sec) { + if (next >= start_segno + usable_segs) { clear_bit(secno, free_i->free_secmap); free_i->free_sections++; } @@ -438,22 +462,23 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi, } static inline void __set_test_and_free(struct f2fs_sb_info *sbi, - unsigned int segno) + unsigned int segno, bool inmem) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; + unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno); spin_lock(&free_i->segmap_lock); if (test_and_clear_bit(segno, free_i->free_segmap)) { free_i->free_segments++; - if (IS_CURSEC(sbi, secno)) + if (!inmem && IS_CURSEC(sbi, secno)) goto skip_free; next = find_next_bit(free_i->free_segmap, start_segno + sbi->segs_per_sec, start_segno); - if (next >= start_segno + sbi->segs_per_sec) { + if (next >= start_segno + usable_segs) { if (test_and_clear_bit(secno, free_i->free_secmap)) free_i->free_sections++; } @@ -500,7 +525,7 @@ static inline unsigned int free_segments(struct f2fs_sb_info *sbi) return FREE_I(sbi)->free_segments; } -static inline int reserved_segments(struct f2fs_sb_info *sbi) +static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi) { return SM_I(sbi)->reserved_segments; } @@ -532,7 +557,7 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) static inline int reserved_sections(struct f2fs_sb_info *sbi) { - return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); + return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi)); } static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) @@ -546,8 +571,8 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) /* check current node segment */ for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) { segno = CURSEG_I(sbi, i)->segno; - left_blocks = sbi->blocks_per_seg - - get_seg_entry(sbi, segno)->ckpt_valid_blocks; + left_blocks = f2fs_usable_blks_in_seg(sbi, segno) - + get_seg_entry(sbi, segno)->ckpt_valid_blocks; if (node_blocks > left_blocks) return false; @@ -555,7 +580,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) /* check current data segment */ segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; - left_blocks = sbi->blocks_per_seg - + left_blocks = f2fs_usable_blks_in_seg(sbi, segno) - get_seg_entry(sbi, segno)->ckpt_valid_blocks; if (dent_blocks > left_blocks) return false; @@ -677,21 +702,22 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; int valid_blocks = 0; int cur_pos = 0, next_pos; + unsigned int usable_blks_per_seg = f2fs_usable_blks_in_seg(sbi, segno); /* check bitmap with valid block count */ do { if (is_valid) { next_pos = find_next_zero_bit_le(&raw_sit->valid_map, - sbi->blocks_per_seg, + usable_blks_per_seg, cur_pos); valid_blocks += next_pos - cur_pos; } else next_pos = find_next_bit_le(&raw_sit->valid_map, - sbi->blocks_per_seg, + usable_blks_per_seg, cur_pos); cur_pos = next_pos; is_valid = !is_valid; - } while (cur_pos < sbi->blocks_per_seg); + } while (cur_pos < usable_blks_per_seg); if (unlikely(GET_SIT_VBLOCKS(raw_sit) != valid_blocks)) { f2fs_err(sbi, "Mismatch valid blocks %d vs. %d", @@ -700,8 +726,13 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, return -EFSCORRUPTED; } + if (usable_blks_per_seg < sbi->blocks_per_seg) + f2fs_bug_on(sbi, find_next_bit_le(&raw_sit->valid_map, + sbi->blocks_per_seg, + usable_blks_per_seg) != sbi->blocks_per_seg); + /* check segment usage, and check boundary of a given segment number */ - if (unlikely(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg + if (unlikely(GET_SIT_VBLOCKS(raw_sit) > usable_blks_per_seg || segno > TOTAL_SEGS(sbi) - 1)) { f2fs_err(sbi, "Wrong valid blocks %d or segno %u", GET_SIT_VBLOCKS(raw_sit), segno); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bef2be3fa3d0..0c958fed3392 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -146,6 +146,7 @@ enum { Opt_compress_algorithm, Opt_compress_log_size, Opt_compress_extension, + Opt_atgc, Opt_err, }; @@ -213,6 +214,7 @@ static match_table_t f2fs_tokens = { {Opt_compress_algorithm, "compress_algorithm=%s"}, {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, + {Opt_atgc, "atgc"}, {Opt_err, NULL}, }; @@ -580,7 +582,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_active_logs: if (args->from && match_int(args, &arg)) return -EINVAL; - if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) + if (arg != 2 && arg != 4 && + arg != NR_CURSEG_PERSIST_TYPE) return -EINVAL; F2FS_OPTION(sbi).active_logs = arg; break; @@ -868,8 +871,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) #ifdef CONFIG_F2FS_FS_COMPRESSION case Opt_compress_algorithm: if (!f2fs_sb_has_compression(sbi)) { - f2fs_err(sbi, "Compression feature if off"); - return -EINVAL; + f2fs_info(sbi, "Image doesn't support compression"); + break; } name = match_strdup(&args[0]); if (!name) @@ -894,8 +897,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; case Opt_compress_log_size: if (!f2fs_sb_has_compression(sbi)) { - f2fs_err(sbi, "Compression feature is off"); - return -EINVAL; + f2fs_info(sbi, "Image doesn't support compression"); + break; } if (args->from && match_int(args, &arg)) return -EINVAL; @@ -909,8 +912,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; case Opt_compress_extension: if (!f2fs_sb_has_compression(sbi)) { - f2fs_err(sbi, "Compression feature is off"); - return -EINVAL; + f2fs_info(sbi, "Image doesn't support compression"); + break; } name = match_strdup(&args[0]); if (!name) @@ -938,6 +941,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) f2fs_info(sbi, "compression options not supported"); break; #endif + case Opt_atgc: + set_opt(sbi, ATGC); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -964,6 +970,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; } #endif + /* + * The BLKZONED feature indicates that the drive was formatted with + * zone alignment optimization. This is optional for host-aware + * devices, but mandatory for host-managed zoned block devices. + */ +#ifndef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi)) { + f2fs_err(sbi, "Zoned block device support is not enabled"); + return -EINVAL; + } +#endif if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) { f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO", @@ -1001,7 +1018,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } /* Not pass down write hints if the number of active logs is lesser - * than NR_CURSEG_TYPE. + * than NR_CURSEG_PERSIST_TYPE. */ if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE) F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; @@ -1020,6 +1037,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); + atomic_set(&fi->i_compr_blocks, 0); init_rwsem(&fi->i_sem); spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); @@ -1184,6 +1202,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) blkdev_put(FDEV(i).bdev, FMODE_EXCL); #ifdef CONFIG_BLK_DEV_ZONED kvfree(FDEV(i).blkz_seq); + kfree(FDEV(i).zone_capacity_blocks); #endif } kvfree(sbi->devs); @@ -1269,6 +1288,7 @@ static void f2fs_put_super(struct super_block *sb) kfree(sbi->raw_super); destroy_device_list(sbi); + f2fs_destroy_page_array_cache(sbi); f2fs_destroy_xattr_caches(sbi); mempool_destroy(sbi->write_io_dummy); #ifdef CONFIG_QUOTA @@ -1280,7 +1300,7 @@ static void f2fs_put_super(struct super_block *sb) for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); #ifdef CONFIG_UNICODE - utf8_unload(sbi->s_encoding); + utf8_unload(sb->s_encoding); #endif kfree(sbi); } @@ -1634,13 +1654,16 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) #ifdef CONFIG_F2FS_FS_COMPRESSION f2fs_show_compress_options(seq, sbi->sb); #endif + + if (test_opt(sbi, ATGC)) + seq_puts(seq, ",atgc"); return 0; } static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ - F2FS_OPTION(sbi).active_logs = NR_CURSEG_TYPE; + F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; @@ -1763,6 +1786,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); + bool no_atgc = !test_opt(sbi, ATGC); bool checkpoint_changed; #ifdef CONFIG_QUOTA int i, j; @@ -1835,6 +1859,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } #endif + /* disallow enable atgc dynamically */ + if (no_atgc == !!test_opt(sbi, ATGC)) { + err = -EINVAL; + f2fs_warn(sbi, "switch atgc option is not allowed"); + goto restore_opts; + } + /* disallow enable/disable extent_cache dynamically */ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { err = -EINVAL; @@ -2679,10 +2710,8 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, } if (main_end_blkaddr > seg_end_blkaddr) { - f2fs_info(sbi, "Wrong MAIN_AREA boundary, start(%u) end(%u) block(%u)", - main_blkaddr, - segment0_blkaddr + - (segment_count << log_blocks_per_seg), + f2fs_info(sbi, "Wrong MAIN_AREA boundary, start(%u) end(%llu) block(%u)", + main_blkaddr, seg_end_blkaddr, segment_count_main << log_blocks_per_seg); return true; } else if (main_end_blkaddr < seg_end_blkaddr) { @@ -2700,10 +2729,8 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, err = __f2fs_commit_super(bh, NULL); res = err ? "failed" : "done"; } - f2fs_info(sbi, "Fix alignment : %s, start(%u) end(%u) block(%u)", - res, main_blkaddr, - segment0_blkaddr + - (segment_count << log_blocks_per_seg), + f2fs_info(sbi, "Fix alignment : %s, start(%u) end(%llu) block(%u)", + res, main_blkaddr, seg_end_blkaddr, segment_count_main << log_blocks_per_seg); if (err) return true; @@ -2714,7 +2741,7 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, static int sanity_check_raw_super(struct f2fs_sb_info *sbi, struct buffer_head *bh) { - block_t segment_count, segs_per_sec, secs_per_zone; + block_t segment_count, segs_per_sec, secs_per_zone, segment_count_main; block_t total_sections, blocks_per_seg; struct f2fs_super_block *raw_super = (struct f2fs_super_block *) (bh->b_data + F2FS_SUPER_OFFSET); @@ -2785,6 +2812,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, } segment_count = le32_to_cpu(raw_super->segment_count); + segment_count_main = le32_to_cpu(raw_super->segment_count_main); segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); total_sections = le32_to_cpu(raw_super->section_count); @@ -2798,14 +2826,19 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return -EFSCORRUPTED; } - if (total_sections > segment_count || - total_sections < F2FS_MIN_SEGMENTS || + if (total_sections > segment_count_main || total_sections < 1 || segs_per_sec > segment_count || !segs_per_sec) { f2fs_info(sbi, "Invalid segment/section count (%u, %u x %u)", segment_count, total_sections, segs_per_sec); return -EFSCORRUPTED; } + if (segment_count_main != total_sections * segs_per_sec) { + f2fs_info(sbi, "Invalid segment/section count (%u != %u * %u)", + segment_count_main, total_sections, segs_per_sec); + return -EFSCORRUPTED; + } + if ((segment_count / segs_per_sec) < total_sections) { f2fs_info(sbi, "Small segment_count (%u < %u * %u)", segment_count, segs_per_sec, total_sections); @@ -2831,6 +2864,12 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, segment_count, dev_seg_count); return -EFSCORRUPTED; } + } else { + if (__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_BLKZONED) && + !bdev_is_zoned(sbi->sb->s_bdev)) { + f2fs_info(sbi, "Zoned block device path is missing"); + return -EFSCORRUPTED; + } } if (secs_per_zone > total_sections || !secs_per_zone) { @@ -2906,7 +2945,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); - if (unlikely(fsmeta < F2FS_MIN_SEGMENTS || + if (unlikely(fsmeta < F2FS_MIN_META_SEGMENTS || ovp_segments == 0 || reserved_segments == 0)) { f2fs_err(sbi, "Wrong layout: check mkfs.f2fs version"); return 1; @@ -2994,7 +3033,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) cp_payload = __cp_payload(sbi); if (cp_pack_start_sum < cp_payload + 1 || cp_pack_start_sum > blocks_per_seg - 1 - - NR_CURSEG_TYPE) { + NR_CURSEG_PERSIST_TYPE) { f2fs_err(sbi, "Wrong cp_pack_start_sum: %u", cp_pack_start_sum); return 1; @@ -3087,13 +3126,26 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) } #ifdef CONFIG_BLK_DEV_ZONED + +struct f2fs_report_zones_args { + struct f2fs_dev_info *dev; + bool zone_cap_mismatch; +}; + static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx, - void *data) + void *data) { - struct f2fs_dev_info *dev = data; + struct f2fs_report_zones_args *rz_args = data; + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return 0; + + set_bit(idx, rz_args->dev->blkz_seq); + rz_args->dev->zone_capacity_blocks[idx] = zone->capacity >> + F2FS_LOG_SECTORS_PER_BLOCK; + if (zone->len != zone->capacity && !rz_args->zone_cap_mismatch) + rz_args->zone_cap_mismatch = true; - if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) - set_bit(idx, dev->blkz_seq); return 0; } @@ -3101,6 +3153,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) { struct block_device *bdev = FDEV(devi).bdev; sector_t nr_sectors = bdev->bd_part->nr_sects; + struct f2fs_report_zones_args rep_zone_arg; int ret; if (!f2fs_sb_has_blkzoned(sbi)) @@ -3126,12 +3179,26 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) if (!FDEV(devi).blkz_seq) return -ENOMEM; - /* Get block zones type */ + /* Get block zones type and zone-capacity */ + FDEV(devi).zone_capacity_blocks = f2fs_kzalloc(sbi, + FDEV(devi).nr_blkz * sizeof(block_t), + GFP_KERNEL); + if (!FDEV(devi).zone_capacity_blocks) + return -ENOMEM; + + rep_zone_arg.dev = &FDEV(devi); + rep_zone_arg.zone_cap_mismatch = false; + ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, f2fs_report_zone_cb, - &FDEV(devi)); + &rep_zone_arg); if (ret < 0) return ret; + if (!rep_zone_arg.zone_cap_mismatch) { + kfree(FDEV(devi).zone_capacity_blocks); + FDEV(devi).zone_capacity_blocks = NULL; + } + return 0; } #endif @@ -3328,7 +3395,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) { #ifdef CONFIG_UNICODE - if (f2fs_sb_has_casefold(sbi) && !sbi->s_encoding) { + if (f2fs_sb_has_casefold(sbi) && !sbi->sb->s_encoding) { const struct f2fs_sb_encodings *encoding_info; struct unicode_map *encoding; __u16 encoding_flags; @@ -3359,8 +3426,8 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) "%s-%s with flags 0x%hx", encoding_info->name, encoding_info->version?:"\b", encoding_flags); - sbi->s_encoding = encoding; - sbi->s_encoding_flags = encoding_flags; + sbi->sb->s_encoding = encoding; + sbi->sb->s_encoding_flags = encoding_flags; sbi->sb->s_d_op = &f2fs_dentry_ops; } #else @@ -3439,18 +3506,6 @@ try_onemore: sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, sizeof(raw_super->uuid)); - /* - * The BLKZONED feature indicates that the drive was formatted with - * zone alignment optimization. This is optional for host-aware - * devices, but mandatory for host-managed zoned block devices. - */ -#ifndef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_has_blkzoned(sbi)) { - f2fs_err(sbi, "Zoned block device support is not enabled"); - err = -EOPNOTSUPP; - goto free_sb_buf; - } -#endif default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -3565,13 +3620,16 @@ try_onemore: err = f2fs_init_xattr_caches(sbi); if (err) goto free_io_dummy; + err = f2fs_init_page_array_cache(sbi); + if (err) + goto free_xattr_cache; /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { f2fs_err(sbi, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); - goto free_xattr_cache; + goto free_page_array_cache; } err = f2fs_get_valid_checkpoint(sbi); @@ -3761,6 +3819,8 @@ try_onemore: } reset_checkpoint: + f2fs_init_inmem_curseg(sbi); + /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -3845,6 +3905,8 @@ free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); sbi->meta_inode = NULL; +free_page_array_cache: + f2fs_destroy_page_array_cache(sbi); free_xattr_cache: f2fs_destroy_xattr_caches(sbi); free_io_dummy: @@ -3856,7 +3918,7 @@ free_bio_info: kvfree(sbi->write_io[i]); #ifdef CONFIG_UNICODE - utf8_unload(sbi->s_encoding); + utf8_unload(sb->s_encoding); #endif free_options: #ifdef CONFIG_QUOTA @@ -3966,9 +4028,12 @@ static int __init init_f2fs_fs(void) err = f2fs_create_extent_cache(); if (err) goto free_checkpoint_caches; - err = f2fs_init_sysfs(); + err = f2fs_create_garbage_collection_cache(); if (err) goto free_extent_cache; + err = f2fs_init_sysfs(); + if (err) + goto free_garbage_collection_cache; err = register_shrinker(&f2fs_shrinker_info); if (err) goto free_sysfs; @@ -3988,7 +4053,12 @@ static int __init init_f2fs_fs(void) err = f2fs_init_compress_mempool(); if (err) goto free_bioset; + err = f2fs_init_compress_cache(); + if (err) + goto free_compress_mempool; return 0; +free_compress_mempool: + f2fs_destroy_compress_mempool(); free_bioset: f2fs_destroy_bioset(); free_bio_enrty_cache: @@ -4002,6 +4072,8 @@ free_shrinker: unregister_shrinker(&f2fs_shrinker_info); free_sysfs: f2fs_exit_sysfs(); +free_garbage_collection_cache: + f2fs_destroy_garbage_collection_cache(); free_extent_cache: f2fs_destroy_extent_cache(); free_checkpoint_caches: @@ -4018,6 +4090,7 @@ fail: static void __exit exit_f2fs_fs(void) { + f2fs_destroy_compress_cache(); f2fs_destroy_compress_mempool(); f2fs_destroy_bioset(); f2fs_destroy_bio_entry_cache(); @@ -4026,6 +4099,7 @@ static void __exit exit_f2fs_fs(void) unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); f2fs_exit_sysfs(); + f2fs_destroy_garbage_collection_cache(); f2fs_destroy_extent_cache(); f2fs_destroy_checkpoint_caches(); f2fs_destroy_segment_manager_caches(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 88ed9969cc86..ec77ccfea923 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -176,12 +176,14 @@ static ssize_t encoding_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { #ifdef CONFIG_UNICODE + struct super_block *sb = sbi->sb; + if (f2fs_sb_has_casefold(sbi)) return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n", - sbi->s_encoding->charset, - (sbi->s_encoding->version >> 16) & 0xff, - (sbi->s_encoding->version >> 8) & 0xff, - sbi->s_encoding->version & 0xff); + sb->s_encoding->charset, + (sb->s_encoding->version >> 16) & 0xff, + (sb->s_encoding->version >> 8) & 0xff, + sb->s_encoding->version & 0xff); #endif return sprintf(buf, "(none)"); } @@ -375,12 +377,17 @@ out: return count; } if (!strcmp(a->attr.name, "gc_idle")) { - if (t == GC_IDLE_CB) + if (t == GC_IDLE_CB) { sbi->gc_mode = GC_IDLE_CB; - else if (t == GC_IDLE_GREEDY) + } else if (t == GC_IDLE_GREEDY) { sbi->gc_mode = GC_IDLE_GREEDY; - else + } else if (t == GC_IDLE_AT) { + if (!sbi->am.atgc_enabled) + return -EINVAL; + sbi->gc_mode = GC_AT; + } else { sbi->gc_mode = GC_NORMAL; + } return count; } @@ -968,4 +975,5 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) } kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 1b0736ce0918..65afcc3cc68a 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -39,7 +39,7 @@ static void xattr_free(struct f2fs_sb_info *sbi, void *xattr_addr, if (is_inline) kmem_cache_free(sbi->inline_xattr_slab, xattr_addr); else - kvfree(xattr_addr); + kfree(xattr_addr); } static int f2fs_xattr_generic_get(const struct xattr_handler *handler, @@ -425,7 +425,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, *base_addr = txattr_addr; return 0; fail: - kvfree(txattr_addr); + kfree(txattr_addr); return err; } @@ -610,7 +610,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) } error = buffer_size - rest; cleanup: - kvfree(base_addr); + kfree(base_addr); return error; } @@ -750,7 +750,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: - kvfree(base_addr); + kfree(base_addr); return error; } diff --git a/fs/libfs.c b/fs/libfs.c index e0d42e977d9a..fc34361c1489 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -20,6 +20,8 @@ #include <linux/fs_context.h> #include <linux/pseudo_fs.h> #include <linux/fsnotify.h> +#include <linux/unicode.h> +#include <linux/fscrypt.h> #include <linux/uaccess.h> @@ -1363,3 +1365,88 @@ bool is_empty_dir_inode(struct inode *inode) return (inode->i_fop == &empty_dir_operations) && (inode->i_op == &empty_dir_inode_operations); } + +#ifdef CONFIG_UNICODE +/* + * Determine if the name of a dentry should be casefolded. + * + * Return: if names will need casefolding + */ +static bool needs_casefold(const struct inode *dir) +{ + return IS_CASEFOLDED(dir) && dir->i_sb->s_encoding; +} + +/** + * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems + * @dentry: dentry whose name we are checking against + * @len: len of name of dentry + * @str: str pointer to name of dentry + * @name: Name to compare against + * + * Return: 0 if names match, 1 if mismatch, or -ERRNO + */ +int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) +{ + const struct dentry *parent = READ_ONCE(dentry->d_parent); + const struct inode *dir = READ_ONCE(parent->d_inode); + const struct super_block *sb = dentry->d_sb; + const struct unicode_map *um = sb->s_encoding; + struct qstr qstr = QSTR_INIT(str, len); + char strbuf[DNAME_INLINE_LEN]; + int ret; + + if (!dir || !needs_casefold(dir)) + goto fallback; + /* + * If the dentry name is stored in-line, then it may be concurrently + * modified by a rename. If this happens, the VFS will eventually retry + * the lookup, so it doesn't matter what ->d_compare() returns. + * However, it's unsafe to call utf8_strncasecmp() with an unstable + * string. Therefore, we have to copy the name into a temporary buffer. + */ + if (len <= DNAME_INLINE_LEN - 1) { + memcpy(strbuf, str, len); + strbuf[len] = 0; + qstr.name = strbuf; + /* prevent compiler from optimizing out the temporary buffer */ + barrier(); + } + ret = utf8_strncasecmp(um, name, &qstr); + if (ret >= 0) + return ret; + + if (sb_has_strict_encoding(sb)) + return -EINVAL; +fallback: + if (len != name->len) + return 1; + return !!memcmp(str, name->name, len); +} +EXPORT_SYMBOL(generic_ci_d_compare); + +/** + * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems + * @dentry: dentry of the parent directory + * @str: qstr of name whose hash we should fill in + * + * Return: 0 if hash was successful or unchanged, and -EINVAL on error + */ +int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) +{ + const struct inode *dir = READ_ONCE(dentry->d_inode); + struct super_block *sb = dentry->d_sb; + const struct unicode_map *um = sb->s_encoding; + int ret = 0; + + if (!dir || !needs_casefold(dir)) + return 0; + + ret = utf8_casefold_hash(um, dentry, str); + if (ret < 0 && sb_has_strict_encoding(sb)) + return -EINVAL; + return 0; +} +EXPORT_SYMBOL(generic_ci_d_hash); +#endif diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c index 2a878b739115..dc25823bfed9 100644 --- a/fs/unicode/utf8-core.c +++ b/fs/unicode/utf8-core.c @@ -6,6 +6,7 @@ #include <linux/parser.h> #include <linux/errno.h> #include <linux/unicode.h> +#include <linux/stringhash.h> #include "utf8n.h" @@ -122,9 +123,29 @@ int utf8_casefold(const struct unicode_map *um, const struct qstr *str, } return -EINVAL; } - EXPORT_SYMBOL(utf8_casefold); +int utf8_casefold_hash(const struct unicode_map *um, const void *salt, + struct qstr *str) +{ + const struct utf8data *data = utf8nfdicf(um->version); + struct utf8cursor cur; + int c; + unsigned long hash = init_name_hash(salt); + + if (utf8ncursor(&cur, data, str->name, str->len) < 0) + return -EINVAL; + + while ((c = utf8byte(&cur))) { + if (c < 0) + return -EINVAL; + hash = partial_name_hash((unsigned char)c, hash); + } + str->hash = end_name_hash(hash); + return 0; +} +EXPORT_SYMBOL(utf8_casefold_hash); + int utf8_normalize(const struct unicode_map *um, const struct qstr *str, unsigned char *dest, size_t dlen) { |