summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-02-14 20:48:10 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2026-02-14 20:48:10 +0300
commit3e48a11675c50698374d4ac596fb506736eb1c53 (patch)
tree19784102302960f534344f97950dd3230a0163f5
parent770aaedb461a055f79b971d538678942b6607894 (diff)
parent52190933c37a96164b271f3f30c16099d9eb8c09 (diff)
downloadlinux-3e48a11675c50698374d4ac596fb506736eb1c53.tar.xz
Merge tag 'f2fs-for-7.0-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs
Pull f2fs updates from Jaegeuk Kim: "In this development cycle, we focused on several key performance optimizations: - introducing large folio support to enhance read speeds for immutable files - reducing checkpoint=enable latency by flushing only committed dirty pages - implementing tracepoints to diagnose and resolve lock priority inversion. Additionally, we introduced the packed_ssa feature to optimize the SSA footprint when utilizing large block sizes. Detail summary: Enhancements: - support large folio for immutable non-compressed case - support non-4KB block size without packed_ssa feature - optimize f2fs_enable_checkpoint() to avoid long delay - optimize f2fs_overwrite_io() for f2fs_iomap_begin - optimize NAT block loading during checkpoint write - add write latency stats for NAT and SIT blocks in f2fs_write_checkpoint - pin files do not require sbi->writepages lock for ordering - avoid f2fs_map_blocks() for consecutive holes in readpages - flush plug periodically during GC to maximize readahead effect - add tracepoints to catch lock overheads - add several sysfs entries to tune internal lock priorities Fixes: - fix lock priority inversion issue - fix incomplete block usage in compact SSA summaries - fix to show simulate_lock_timeout correctly - fix to avoid mapping wrong physical block for swapfile - fix IS_CHECKPOINTED flag inconsistency issue caused by concurrent atomic commit and checkpoint writes - fix to avoid UAF in f2fs_write_end_io()" * tag 'f2fs-for-7.0-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs: (61 commits) f2fs: sysfs: introduce critical_task_priority f2fs: introduce trace_f2fs_priority_update f2fs: fix lock priority inversion issue f2fs: optimize f2fs_overwrite_io() for f2fs_iomap_begin f2fs: fix incomplete block usage in compact SSA summaries f2fs: decrease maximum flush retry count in f2fs_enable_checkpoint() f2fs: optimize NAT block loading during checkpoint write f2fs: change size parameter of __has_cursum_space() to 
unsigned int f2fs: add write latency stats for NAT and SIT blocks in f2fs_write_checkpoint f2fs: pin files do not require sbi->writepages lock for ordering f2fs: fix to show simulate_lock_timeout correctly f2fs: introduce FAULT_SKIP_WRITE f2fs: check skipped write in f2fs_enable_checkpoint() Revert "f2fs: add timeout in f2fs_enable_checkpoint()" f2fs: fix to unlock folio in f2fs_read_data_large_folio() f2fs: fix error path handling in f2fs_read_data_large_folio() f2fs: use folio_end_read f2fs: fix to avoid mapping wrong physical block for swapfile f2fs: avoid f2fs_map_blocks() for consecutive holes in readpages f2fs: advance index and offset after zeroing in large folio read ...
-rw-r--r--Documentation/ABI/testing/sysfs-fs-f2fs62
-rw-r--r--Documentation/filesystems/f2fs.rst49
-rw-r--r--fs/f2fs/checkpoint.c247
-rw-r--r--fs/f2fs/compress.c18
-rw-r--r--fs/f2fs/data.c438
-rw-r--r--fs/f2fs/debug.c1
-rw-r--r--fs/f2fs/f2fs.h251
-rw-r--r--fs/f2fs/file.c84
-rw-r--r--fs/f2fs/gc.c86
-rw-r--r--fs/f2fs/inline.c10
-rw-r--r--fs/f2fs/inode.c16
-rw-r--r--fs/f2fs/namei.c65
-rw-r--r--fs/f2fs/node.c99
-rw-r--r--fs/f2fs/node.h8
-rw-r--r--fs/f2fs/recovery.c11
-rw-r--r--fs/f2fs/segment.c133
-rw-r--r--fs/f2fs/segment.h108
-rw-r--r--fs/f2fs/super.c202
-rw-r--r--fs/f2fs/sysfs.c111
-rw-r--r--fs/f2fs/xattr.c5
-rw-r--r--include/linux/f2fs_fs.h73
-rw-r--r--include/trace/events/f2fs.h142
22 files changed, 1672 insertions, 547 deletions
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index 770470e0598b..c1d2b3fd9c65 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -520,7 +520,7 @@ What: /sys/fs/f2fs/<disk>/ckpt_thread_ioprio
Date: January 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Give a way to change checkpoint merge daemon's io priority.
- Its default value is "be,3", which means "BE" I/O class and
+ Its default value is "rt,3", which means "RT" I/O class and
I/O priority "3". We can select the class between "rt" and "be",
and set the I/O priority within valid range of it. "," delimiter
is necessary in between I/O class and priority number.
@@ -732,7 +732,7 @@ Description: Support configuring fault injection type, should be
FAULT_TRUNCATE 0x00000400
FAULT_READ_IO 0x00000800
FAULT_CHECKPOINT 0x00001000
- FAULT_DISCARD 0x00002000
+ FAULT_DISCARD 0x00002000 (obsolete)
FAULT_WRITE_IO 0x00004000
FAULT_SLAB_ALLOC 0x00008000
FAULT_DQUOT_INIT 0x00010000
@@ -741,8 +741,10 @@ Description: Support configuring fault injection type, should be
FAULT_BLKADDR_CONSISTENCE 0x00080000
FAULT_NO_SEGMENT 0x00100000
FAULT_INCONSISTENT_FOOTER 0x00200000
- FAULT_TIMEOUT 0x00400000 (1000ms)
+ FAULT_ATOMIC_TIMEOUT 0x00400000 (1000ms)
FAULT_VMALLOC 0x00800000
+ FAULT_LOCK_TIMEOUT 0x01000000 (1000ms)
+ FAULT_SKIP_WRITE 0x02000000
=========================== ==========
What: /sys/fs/f2fs/<disk>/discard_io_aware_gran
@@ -939,3 +941,57 @@ Description: Controls write priority in multi-devices setups. A value of 0 means
allocate_section_policy = 1 Prioritize writing to section before allocate_section_hint
allocate_section_policy = 2 Prioritize writing to section after allocate_section_hint
=========================== ==========================================================
+
+What: /sys/fs/f2fs/<disk>/max_lock_elapsed_time
+Date: December 2025
+Contact: "Chao Yu" <chao@kernel.org>
+Description: This is a threshold: once the total elapsed time of a thread inside a
+ lock-covered critical region exceeds it, f2fs will print a tracepoint to dump
+ information about the related context. This sysfs entry can be used to control
+ the value of the threshold; by default, the value is 500 ms.
+
+What: /sys/fs/f2fs/<disk>/inject_timeout_type
+Date: December 2025
+Contact: "Chao Yu" <chao@kernel.org>
+Description: This sysfs entry can be used to change type of injected timeout:
+ ========== ===============================
+ Flag_Value Flag_Description
+ ========== ===============================
+ 0x00000000 No timeout (default)
+ 0x00000001 Simulate running time
+ 0x00000002 Simulate IO type sleep time
+ 0x00000003 Simulate Non-IO type sleep time
+ 0x00000004 Simulate runnable time
+ ========== ===============================
+
+What: /sys/fs/f2fs/<disk>/adjust_lock_priority
+Date: January 2026
+Contact: "Chao Yu" <chao@kernel.org>
+Description: This sysfs entry can be used to enable/disable priority adjustment for a task
+ which is in a critical region covered by a lock.
+ ========== ==================
+ Flag_Value Flag_Description
+ ========== ==================
+ 0x00000000 Disabled (default)
+ 0x00000001 cp_rwsem
+ 0x00000002 node_change
+ 0x00000004 node_write
+ 0x00000008 gc_lock
+ 0x00000010 cp_global
+ 0x00000020 io_rwsem
+ ========== ==================
+
+What: /sys/fs/f2fs/<disk>/lock_duration_priority
+Date: January 2026
+Contact: "Chao Yu" <chao@kernel.org>
+Description: f2fs can tune the priority of a thread which has entered a critical region covered
+ by an f2fs rw_semaphore lock. This sysfs entry can be used to control the priority
+ value; the range is [100,139], by default the value is 120.
+
+What: /sys/fs/f2fs/<disk>/critical_task_priority
+Date: February 2026
+Contact: "Chao Yu" <chao@kernel.org>
+Description: It can be used to tune priority of f2fs critical task, e.g. f2fs_ckpt, f2fs_gc
+ threads, limitation as below:
+ - it requires user has CAP_SYS_NICE capability.
+ - the range is [100, 139], by default the value is 100.
diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index cb90d1ae82d0..7e4031631286 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -206,7 +206,7 @@ fault_type=%d Support configuring fault injection type, should be
FAULT_TRUNCATE 0x00000400
FAULT_READ_IO 0x00000800
FAULT_CHECKPOINT 0x00001000
- FAULT_DISCARD 0x00002000
+ FAULT_DISCARD 0x00002000 (obsolete)
FAULT_WRITE_IO 0x00004000
FAULT_SLAB_ALLOC 0x00008000
FAULT_DQUOT_INIT 0x00010000
@@ -215,8 +215,10 @@ fault_type=%d Support configuring fault injection type, should be
FAULT_BLKADDR_CONSISTENCE 0x00080000
FAULT_NO_SEGMENT 0x00100000
FAULT_INCONSISTENT_FOOTER 0x00200000
- FAULT_TIMEOUT 0x00400000 (1000ms)
+ FAULT_ATOMIC_TIMEOUT 0x00400000 (1000ms)
FAULT_VMALLOC 0x00800000
+ FAULT_LOCK_TIMEOUT 0x01000000 (1000ms)
+ FAULT_SKIP_WRITE 0x02000000
=========================== ==========
mode=%s Control block allocation mode which supports "adaptive"
and "lfs". In "lfs" mode, there should be no random
@@ -1033,3 +1035,46 @@ the reserved space back to F2FS for its own use.
So, the key idea is, user can do any file operations on /dev/vdc, and
reclaim the space after the use, while the space is counted as /data.
That doesn't require modifying partition size and filesystem format.
+
+Per-file Read-Only Large Folio Support
+--------------------------------------
+
+F2FS implements large folio support on the read path to leverage high-order
+page allocation for significant performance gains. To minimize code complexity,
+this support is currently excluded from the write path, which requires handling
+complex optimizations such as compression and block allocation modes.
+
+This optional feature is triggered only when a file's immutable bit is set.
+Consequently, F2FS will return EOPNOTSUPP if a user attempts to open a cached
+file with write permissions, even immediately after clearing the bit. Write
+access is only restored once the cached inode is dropped. The usage flow is
+demonstrated below:
+
+.. code-block::
+
+ # f2fs_io setflags immutable /data/testfile_read_seq
+
+ /* flush and reload the inode to enable the large folio */
+ # sync && echo 3 > /proc/sys/vm/drop_caches
+
+ /* mmap(MAP_POPULATE) + mlock() */
+ # f2fs_io read 128 0 1024 mmap 1 0 /data/testfile_read_seq
+
+ /* mmap() + fadvise(POSIX_FADV_WILLNEED) + mlock() */
+ # f2fs_io read 128 0 1024 fadvise 1 0 /data/testfile_read_seq
+
+ /* mmap() + mlock2(MLOCK_ONFAULT) + madvise(MADV_POPULATE_READ) */
+ # f2fs_io read 128 0 1024 madvise 1 0 /data/testfile_read_seq
+
+ # f2fs_io clearflags immutable /data/testfile_read_seq
+
+ # f2fs_io write 1 0 1 zero buffered /data/testfile_read_seq
+ Failed to open /mnt/test/test: Operation not supported
+
+ /* flush and reload the inode to disable the large folio */
+ # sync && echo 3 > /proc/sys/vm/drop_caches
+
+ # f2fs_io write 1 0 1 zero buffered /data/testfile_read_seq
+ Written 4096 bytes with pattern = zero, total_time = 29 us, max_latency = 28 us
+
+ # rm /data/testfile_read_seq
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 300664269eb6..6dd39b7de11a 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -14,6 +14,9 @@
#include <linux/pagevec.h>
#include <linux/swap.h>
#include <linux/kthread.h>
+#include <linux/delayacct.h>
+#include <linux/ioprio.h>
+#include <linux/math64.h>
#include "f2fs.h"
#include "node.h"
@@ -21,6 +24,209 @@
#include "iostat.h"
#include <trace/events/f2fs.h>
+static inline void get_lock_elapsed_time(struct f2fs_time_stat *ts)
+{
+ ts->total_time = ktime_get();
+#ifdef CONFIG_64BIT
+ ts->running_time = current->se.sum_exec_runtime;
+#endif
+#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
+ ts->runnable_time = current->sched_info.run_delay;
+#endif
+#ifdef CONFIG_TASK_DELAY_ACCT
+ if (current->delays)
+ ts->io_sleep_time = current->delays->blkio_delay;
+#endif
+}
+
+static inline void trace_lock_elapsed_time_start(struct f2fs_rwsem *sem,
+ struct f2fs_lock_context *lc)
+{
+ lc->lock_trace = trace_f2fs_lock_elapsed_time_enabled();
+ if (!lc->lock_trace)
+ return;
+
+ get_lock_elapsed_time(&lc->ts);
+}
+
+static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem,
+ struct f2fs_lock_context *lc, bool is_write)
+{
+ struct f2fs_time_stat tts;
+ unsigned long long total_time;
+ unsigned long long running_time = 0;
+ unsigned long long runnable_time = 0;
+ unsigned long long io_sleep_time = 0;
+ unsigned long long other_time = 0;
+ unsigned npm = NSEC_PER_MSEC;
+
+ if (!lc->lock_trace)
+ return;
+
+ if (time_to_inject(sem->sbi, FAULT_LOCK_TIMEOUT))
+ f2fs_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT, true);
+
+ get_lock_elapsed_time(&tts);
+
+ total_time = div_u64(tts.total_time - lc->ts.total_time, npm);
+ if (total_time <= sem->sbi->max_lock_elapsed_time)
+ return;
+
+#ifdef CONFIG_64BIT
+ running_time = div_u64(tts.running_time - lc->ts.running_time, npm);
+#endif
+#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
+ runnable_time = div_u64(tts.runnable_time - lc->ts.runnable_time, npm);
+#endif
+#ifdef CONFIG_TASK_DELAY_ACCT
+ io_sleep_time = div_u64(tts.io_sleep_time - lc->ts.io_sleep_time, npm);
+#endif
+ if (total_time > running_time + io_sleep_time + runnable_time)
+ other_time = total_time - running_time -
+ io_sleep_time - runnable_time;
+
+ trace_f2fs_lock_elapsed_time(sem->sbi, sem->name, is_write, current,
+ get_current_ioprio(), total_time, running_time,
+ runnable_time, io_sleep_time, other_time);
+}
+
+static bool need_uplift_priority(struct f2fs_rwsem *sem, bool is_write)
+{
+ if (!(sem->sbi->adjust_lock_priority & BIT(sem->name - 1)))
+ return false;
+
+ switch (sem->name) {
+ /*
+ * writer is checkpoint which has high priority, let's just uplift
+ * priority for reader
+ */
+ case LOCK_NAME_CP_RWSEM:
+ case LOCK_NAME_NODE_CHANGE:
+ case LOCK_NAME_NODE_WRITE:
+ return !is_write;
+ case LOCK_NAME_GC_LOCK:
+ case LOCK_NAME_CP_GLOBAL:
+ case LOCK_NAME_IO_RWSEM:
+ return true;
+ default:
+ f2fs_bug_on(sem->sbi, 1);
+ }
+ return false;
+}
+
+static void uplift_priority(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc,
+ bool is_write)
+{
+ lc->need_restore = false;
+ if (!sem->sbi->adjust_lock_priority)
+ return;
+ if (rt_task(current))
+ return;
+ if (!need_uplift_priority(sem, is_write))
+ return;
+ lc->orig_nice = task_nice(current);
+ lc->new_nice = PRIO_TO_NICE(sem->sbi->lock_duration_priority);
+ if (lc->orig_nice <= lc->new_nice)
+ return;
+ set_user_nice(current, lc->new_nice);
+ lc->need_restore = true;
+
+ trace_f2fs_priority_uplift(sem->sbi, sem->name, is_write, current,
+ NICE_TO_PRIO(lc->orig_nice), NICE_TO_PRIO(lc->new_nice));
+}
+
+static void restore_priority(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc,
+ bool is_write)
+{
+ if (!lc->need_restore)
+ return;
+ /* someone has updated the priority */
+ if (task_nice(current) != lc->new_nice)
+ return;
+ set_user_nice(current, lc->orig_nice);
+
+ trace_f2fs_priority_restore(sem->sbi, sem->name, is_write, current,
+ NICE_TO_PRIO(lc->orig_nice), NICE_TO_PRIO(lc->new_nice));
+}
+
+void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
+{
+ uplift_priority(sem, lc, false);
+ f2fs_down_read(sem);
+ trace_lock_elapsed_time_start(sem, lc);
+}
+
+int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
+{
+ uplift_priority(sem, lc, false);
+ if (!f2fs_down_read_trylock(sem)) {
+ restore_priority(sem, lc, false);
+ return 0;
+ }
+ trace_lock_elapsed_time_start(sem, lc);
+ return 1;
+}
+
+void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
+{
+ f2fs_up_read(sem);
+ restore_priority(sem, lc, false);
+ trace_lock_elapsed_time_end(sem, lc, false);
+}
+
+void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
+{
+ uplift_priority(sem, lc, true);
+ f2fs_down_write(sem);
+ trace_lock_elapsed_time_start(sem, lc);
+}
+
+int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
+{
+ uplift_priority(sem, lc, true);
+ if (!f2fs_down_write_trylock(sem)) {
+ restore_priority(sem, lc, true);
+ return 0;
+ }
+ trace_lock_elapsed_time_start(sem, lc);
+ return 1;
+}
+
+void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
+{
+ f2fs_up_write(sem);
+ restore_priority(sem, lc, true);
+ trace_lock_elapsed_time_end(sem, lc, true);
+}
+
+void f2fs_lock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc)
+{
+ f2fs_down_read_trace(&sbi->cp_rwsem, lc);
+}
+
+int f2fs_trylock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc)
+{
+ if (time_to_inject(sbi, FAULT_LOCK_OP))
+ return 0;
+
+ return f2fs_down_read_trylock_trace(&sbi->cp_rwsem, lc);
+}
+
+void f2fs_unlock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc)
+{
+ f2fs_up_read_trace(&sbi->cp_rwsem, lc);
+}
+
+static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
+{
+ f2fs_down_write(&sbi->cp_rwsem);
+}
+
+static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
+{
+ f2fs_up_write(&sbi->cp_rwsem);
+}
+
#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3))
static struct kmem_cache *ino_entry_slab;
@@ -379,6 +585,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
+ struct f2fs_lock_context lc;
long diff, written;
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
@@ -391,13 +598,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
goto skip_write;
/* if locked failed, cp will flush dirty pages instead */
- if (!f2fs_down_write_trylock(&sbi->cp_global_sem))
+ if (!f2fs_down_write_trylock_trace(&sbi->cp_global_sem, &lc))
goto skip_write;
trace_f2fs_writepages(mapping->host, wbc, META);
diff = nr_pages_to_write(sbi, META, wbc);
- written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO);
- f2fs_up_write(&sbi->cp_global_sem);
+ written = f2fs_sync_meta_pages(sbi, wbc->nr_to_write, FS_META_IO);
+ f2fs_up_write_trace(&sbi->cp_global_sem, &lc);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
return 0;
@@ -407,8 +614,8 @@ skip_write:
return 0;
}
-long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
- long nr_to_write, enum iostat_type io_type)
+long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, long nr_to_write,
+ enum iostat_type io_type)
{
struct address_space *mapping = META_MAPPING(sbi);
pgoff_t index = 0, prev = ULONG_MAX;
@@ -469,7 +676,7 @@ continue_unlock:
}
stop:
if (nwritten)
- f2fs_submit_merged_write(sbi, type);
+ f2fs_submit_merged_write(sbi, META);
blk_finish_plug(&plug);
@@ -1312,8 +1519,7 @@ void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type)
break;
if (type == F2FS_DIRTY_META)
- f2fs_sync_meta_pages(sbi, META, LONG_MAX,
- FS_CP_META_IO);
+ f2fs_sync_meta_pages(sbi, LONG_MAX, FS_CP_META_IO);
else if (type == F2FS_WB_CP_DATA)
f2fs_submit_merged_write(sbi, DATA);
@@ -1485,7 +1691,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
int err;
/* Flush all the NAT/SIT pages */
- f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
+ f2fs_sync_meta_pages(sbi, LONG_MAX, FS_CP_META_IO);
stat_cp_time(cpc, CP_TIME_SYNC_META);
@@ -1584,7 +1790,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
}
/* Here, we have one bio having CP pack except cp pack 2 page */
- f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
+ f2fs_sync_meta_pages(sbi, LONG_MAX, FS_CP_META_IO);
stat_cp_time(cpc, CP_TIME_SYNC_CP_META);
/* Wait for all dirty meta pages to be submitted for IO */
@@ -1646,6 +1852,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ struct f2fs_lock_context lc;
unsigned long long ckpt_ver;
int err = 0;
@@ -1660,7 +1867,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_warn(sbi, "Start checkpoint disabled!");
}
if (cpc->reason != CP_RESIZE)
- f2fs_down_write(&sbi->cp_global_sem);
+ f2fs_down_write_trace(&sbi->cp_global_sem, &lc);
stat_cp_time(cpc, CP_TIME_LOCK);
@@ -1701,6 +1908,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
goto out;
}
}
+ stat_cp_time(cpc, CP_TIME_MERGE_WRITE);
/*
* update checkpoint pack index
@@ -1717,10 +1925,11 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_bug_on(sbi, !f2fs_cp_error(sbi));
goto stop;
}
+ stat_cp_time(cpc, CP_TIME_FLUSH_NAT);
f2fs_flush_sit_entries(sbi, cpc);
- stat_cp_time(cpc, CP_TIME_FLUSH_META);
+ stat_cp_time(cpc, CP_TIME_FLUSH_SIT);
/* save inmem log status */
f2fs_save_inmem_curseg(sbi);
@@ -1750,7 +1959,7 @@ stop:
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_FINISH_CHECKPOINT);
out:
if (cpc->reason != CP_RESIZE)
- f2fs_up_write(&sbi->cp_global_sem);
+ f2fs_up_write_trace(&sbi->cp_global_sem, &lc);
return err;
}
@@ -1796,11 +2005,12 @@ void f2fs_destroy_checkpoint_caches(void)
static int __write_checkpoint_sync(struct f2fs_sb_info *sbi)
{
struct cp_control cpc = { .reason = CP_SYNC, };
+ struct f2fs_lock_context lc;
int err;
- f2fs_down_write(&sbi->gc_lock);
+ f2fs_down_write_trace(&sbi->gc_lock, &lc);
err = f2fs_write_checkpoint(sbi, &cpc);
- f2fs_up_write(&sbi->gc_lock);
+ f2fs_up_write_trace(&sbi->gc_lock, &lc);
return err;
}
@@ -1888,11 +2098,12 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
cpc.reason = __get_cp_reason(sbi);
if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC ||
sbi->umount_lock_holder == current) {
+ struct f2fs_lock_context lc;
int ret;
- f2fs_down_write(&sbi->gc_lock);
+ f2fs_down_write_trace(&sbi->gc_lock, &lc);
ret = f2fs_write_checkpoint(sbi, &cpc);
- f2fs_up_write(&sbi->gc_lock);
+ f2fs_up_write_trace(&sbi->gc_lock, &lc);
return ret;
}
@@ -1947,6 +2158,8 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi)
}
set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio);
+ set_user_nice(cprc->f2fs_issue_ckpt,
+ PRIO_TO_NICE(sbi->critical_task_priority));
return 0;
}
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index ef1225af2acf..006a80acd1de 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1291,6 +1291,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
struct dnode_of_data dn;
struct node_info ni;
struct compress_io_ctx *cic;
+ struct f2fs_lock_context lc;
pgoff_t start_idx = start_idx_of_cluster(cc);
unsigned int last_index = cc->cluster_size - 1;
loff_t psize;
@@ -1309,8 +1310,8 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
* checkpoint. This can only happen to quota writes which can cause
* the below discard race condition.
*/
- f2fs_down_read(&sbi->node_write);
- } else if (!f2fs_trylock_op(sbi)) {
+ f2fs_down_read_trace(&sbi->node_write, &lc);
+ } else if (!f2fs_trylock_op(sbi, &lc)) {
goto out_free;
}
@@ -1434,9 +1435,9 @@ unlock_continue:
f2fs_put_dnode(&dn);
if (quota_inode)
- f2fs_up_read(&sbi->node_write);
+ f2fs_up_read_trace(&sbi->node_write, &lc);
else
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
spin_lock(&fi->i_size_lock);
if (fi->last_disk_size < psize)
@@ -1463,9 +1464,9 @@ out_put_dnode:
f2fs_put_dnode(&dn);
out_unlock_op:
if (quota_inode)
- f2fs_up_read(&sbi->node_write);
+ f2fs_up_read_trace(&sbi->node_write, &lc);
else
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
out_free:
for (i = 0; i < cc->valid_nr_cpages; i++) {
f2fs_compress_free_page(cc->cpages[i]);
@@ -1512,6 +1513,7 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc,
{
struct address_space *mapping = cc->inode->i_mapping;
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
+ struct f2fs_lock_context lc;
int submitted, compr_blocks, i;
int ret = 0;
@@ -1530,7 +1532,7 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc,
/* overwrite compressed cluster w/ normal cluster */
if (compr_blocks > 0)
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
for (i = 0; i < cc->cluster_size; i++) {
struct folio *folio;
@@ -1586,7 +1588,7 @@ continue_unlock:
out:
if (compr_blocks > 0)
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
f2fs_balance_fs(sbi, true);
return ret;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 491f66511201..338df7a2aea6 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -31,9 +31,15 @@
static struct kmem_cache *bio_post_read_ctx_cache;
static struct kmem_cache *bio_entry_slab;
+static struct kmem_cache *ffs_entry_slab;
static mempool_t *bio_post_read_ctx_pool;
static struct bio_set f2fs_bioset;
+struct f2fs_folio_state {
+ spinlock_t state_lock;
+ unsigned int read_pages_pending;
+};
+
#define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE
int __init f2fs_init_bioset(void)
@@ -139,11 +145,15 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
{
struct folio_iter fi;
struct bio_post_read_ctx *ctx = bio->bi_private;
+ unsigned long flags;
bio_for_each_folio_all(fi, bio) {
struct folio *folio = fi.folio;
+ unsigned nr_pages = fi.length >> PAGE_SHIFT;
+ bool finished = true;
- if (f2fs_is_compressed_page(folio)) {
+ if (!folio_test_large(folio) &&
+ f2fs_is_compressed_page(folio)) {
if (ctx && !ctx->decompression_attempted)
f2fs_end_read_compressed_page(folio, true, 0,
in_task);
@@ -151,8 +161,25 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
continue;
}
- dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
- folio_end_read(folio, bio->bi_status == BLK_STS_OK);
+ if (folio_test_large(folio)) {
+ struct f2fs_folio_state *ffs = folio->private;
+
+ spin_lock_irqsave(&ffs->state_lock, flags);
+ ffs->read_pages_pending -= nr_pages;
+ finished = !ffs->read_pages_pending;
+ spin_unlock_irqrestore(&ffs->state_lock, flags);
+ }
+
+ while (nr_pages--)
+ dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
+
+ if (F2FS_F_SB(folio)->node_inode && is_node_folio(folio) &&
+ f2fs_sanity_check_node_footer(F2FS_F_SB(folio),
+ folio, folio->index, NODE_TYPE_REGULAR, true))
+ bio->bi_status = BLK_STS_IOERR;
+
+ if (finished)
+ folio_end_read(folio, bio->bi_status == BLK_STS_OK);
}
if (ctx)
@@ -189,7 +216,7 @@ static void f2fs_verify_bio(struct work_struct *work)
struct folio *folio = fi.folio;
if (!f2fs_is_compressed_page(folio) &&
- !fsverity_verify_page(vi, &folio->page)) {
+ !fsverity_verify_folio(vi, folio)) {
bio->bi_status = BLK_STS_IOERR;
break;
}
@@ -354,18 +381,27 @@ static void f2fs_write_end_io(struct bio *bio)
STOP_CP_REASON_WRITE_FAIL);
}
- f2fs_bug_on(sbi, is_node_folio(folio) &&
- folio->index != nid_of_node(folio));
+ if (is_node_folio(folio)) {
+ f2fs_sanity_check_node_footer(sbi, folio,
+ folio->index, NODE_TYPE_REGULAR, true);
+ f2fs_bug_on(sbi, folio->index != nid_of_node(folio));
+ }
dec_page_count(sbi, type);
+
+ /*
+ * we should access sbi before folio_end_writeback() to
+ * avoid racing w/ kill_f2fs_super()
+ */
+ if (type == F2FS_WB_CP_DATA && !get_pages(sbi, type) &&
+ wq_has_sleeper(&sbi->cp_wait))
+ wake_up(&sbi->cp_wait);
+
if (f2fs_in_warm_node_list(sbi, folio))
f2fs_del_fsync_node_entry(sbi, folio);
folio_clear_f2fs_gcing(folio);
folio_end_writeback(folio);
}
- if (!get_pages(sbi, F2FS_WB_CP_DATA) &&
- wq_has_sleeper(&sbi->cp_wait))
- wake_up(&sbi->cp_wait);
bio_put(bio);
}
@@ -511,6 +547,9 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode,
void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
enum page_type type)
{
+ if (!bio)
+ return;
+
WARN_ON_ONCE(!is_read_io(bio_op(bio)));
trace_f2fs_submit_read_bio(sbi->sb, type, bio);
@@ -597,7 +636,8 @@ int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi)
for (j = HOT; j < n; j++) {
struct f2fs_bio_info *io = &sbi->write_io[i][j];
- init_f2fs_rwsem(&io->io_rwsem);
+ init_f2fs_rwsem_trace(&io->io_rwsem, sbi,
+ LOCK_NAME_IO_RWSEM);
io->sbi = sbi;
io->bio = NULL;
io->last_block_in_bio = 0;
@@ -621,8 +661,9 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi,
{
enum page_type btype = PAGE_TYPE_OF_BIO(type);
struct f2fs_bio_info *io = sbi->write_io[btype] + temp;
+ struct f2fs_lock_context lc;
- f2fs_down_write(&io->io_rwsem);
+ f2fs_down_write_trace(&io->io_rwsem, &lc);
if (!io->bio)
goto unlock_out;
@@ -636,27 +677,37 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi,
}
__submit_merged_bio(io);
unlock_out:
- f2fs_up_write(&io->io_rwsem);
+ f2fs_up_write_trace(&io->io_rwsem, &lc);
}
static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
struct inode *inode, struct folio *folio,
- nid_t ino, enum page_type type, bool force)
+ nid_t ino, enum page_type type, bool writeback)
{
enum temp_type temp;
bool ret = true;
+ bool force = !inode && !folio && !ino;
for (temp = HOT; temp < NR_TEMP_TYPE; temp++) {
if (!force) {
enum page_type btype = PAGE_TYPE_OF_BIO(type);
struct f2fs_bio_info *io = sbi->write_io[btype] + temp;
+ struct f2fs_lock_context lc;
- f2fs_down_read(&io->io_rwsem);
+ f2fs_down_read_trace(&io->io_rwsem, &lc);
ret = __has_merged_page(io->bio, inode, folio, ino);
- f2fs_up_read(&io->io_rwsem);
+ f2fs_up_read_trace(&io->io_rwsem, &lc);
}
- if (ret)
+ if (ret) {
__f2fs_submit_merged_write(sbi, type, temp);
+ /*
+ * For the waiting-writeback case, if the bio owned by the
+ * folio is already submitted, we do not need to submit
+ * other types of bios.
+ */
+ if (writeback)
+ break;
+ }
/* TODO: use HOT temp only for meta pages now. */
if (type >= META)
@@ -666,7 +717,7 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type)
{
- __submit_merged_write_cond(sbi, NULL, NULL, 0, type, true);
+ __submit_merged_write_cond(sbi, NULL, NULL, 0, type, false);
}
void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
@@ -676,6 +727,12 @@ void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
__submit_merged_write_cond(sbi, inode, folio, ino, type, false);
}
+void f2fs_submit_merged_write_folio(struct f2fs_sb_info *sbi,
+ struct folio *folio, enum page_type type)
+{
+ __submit_merged_write_cond(sbi, NULL, folio, 0, type, true);
+}
+
void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi)
{
f2fs_submit_merged_write(sbi, DATA);
@@ -949,11 +1006,12 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp;
struct folio *bio_folio;
+ struct f2fs_lock_context lc;
enum count_type type;
f2fs_bug_on(sbi, is_read_io(fio->op));
- f2fs_down_write(&io->io_rwsem);
+ f2fs_down_write_trace(&io->io_rwsem, &lc);
next:
#ifdef CONFIG_BLK_DEV_ZONED
if (f2fs_sb_has_blkzoned(sbi) && btype < META && io->zone_pending_bio) {
@@ -1035,7 +1093,7 @@ out:
if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
!f2fs_is_checkpoint_ready(sbi))
__submit_merged_bio(io);
- f2fs_up_write(&io->io_rwsem);
+ f2fs_up_write_trace(&io->io_rwsem, &lc);
}
static struct bio *f2fs_grab_read_bio(struct inode *inode,
@@ -1212,11 +1270,21 @@ struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
struct dnode_of_data dn;
struct folio *folio;
int err;
-
+retry:
folio = f2fs_grab_cache_folio(mapping, index, for_write);
if (IS_ERR(folio))
return folio;
+ if (folio_test_large(folio)) {
+ pgoff_t folio_index = mapping_align_index(mapping, index);
+
+ f2fs_folio_put(folio, true);
+ invalidate_inode_pages2_range(mapping, folio_index,
+ folio_index + folio_nr_pages(folio) - 1);
+ f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
+ goto retry;
+ }
+
if (f2fs_lookup_read_extent_cache_block(inode, index,
&dn.data_blkaddr)) {
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
@@ -1428,34 +1496,37 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
return 0;
}
-static void f2fs_map_lock(struct f2fs_sb_info *sbi, int flag)
+static void f2fs_map_lock(struct f2fs_sb_info *sbi,
+ struct f2fs_lock_context *lc,
+ int flag)
{
- f2fs_down_read(&sbi->cp_enable_rwsem);
if (flag == F2FS_GET_BLOCK_PRE_AIO)
- f2fs_down_read(&sbi->node_change);
+ f2fs_down_read_trace(&sbi->node_change, lc);
else
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, lc);
}
-static void f2fs_map_unlock(struct f2fs_sb_info *sbi, int flag)
+static void f2fs_map_unlock(struct f2fs_sb_info *sbi,
+ struct f2fs_lock_context *lc,
+ int flag)
{
if (flag == F2FS_GET_BLOCK_PRE_AIO)
- f2fs_up_read(&sbi->node_change);
+ f2fs_up_read_trace(&sbi->node_change, lc);
else
- f2fs_unlock_op(sbi);
- f2fs_up_read(&sbi->cp_enable_rwsem);
+ f2fs_unlock_op(sbi, lc);
}
int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+ struct f2fs_lock_context lc;
int err = 0;
- f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO);
+ f2fs_map_lock(sbi, &lc, F2FS_GET_BLOCK_PRE_AIO);
if (!f2fs_lookup_read_extent_cache_block(dn->inode, index,
&dn->data_blkaddr))
err = f2fs_reserve_block(dn, index);
- f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO);
+ f2fs_map_unlock(sbi, &lc, F2FS_GET_BLOCK_PRE_AIO);
return err;
}
@@ -1546,6 +1617,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag)
unsigned int maxblocks = map->m_len;
struct dnode_of_data dn;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_lock_context lc;
int mode = map->m_may_create ? ALLOC_NODE : LOOKUP_NODE;
pgoff_t pgofs, end_offset, end;
int err = 0, ofs = 1;
@@ -1584,7 +1656,7 @@ next_dnode:
if (map->m_may_create) {
if (f2fs_lfs_mode(sbi))
f2fs_balance_fs(sbi, true);
- f2fs_map_lock(sbi, flag);
+ f2fs_map_lock(sbi, &lc, flag);
}
/* When reading holes, we need its node page */
@@ -1750,7 +1822,7 @@ skip:
f2fs_put_dnode(&dn);
if (map->m_may_create) {
- f2fs_map_unlock(sbi, flag);
+ f2fs_map_unlock(sbi, &lc, flag);
f2fs_balance_fs(sbi, dn.node_changed);
}
goto next_dnode;
@@ -1797,7 +1869,7 @@ sync_out:
f2fs_put_dnode(&dn);
unlock_out:
if (map->m_may_create) {
- f2fs_map_unlock(sbi, flag);
+ f2fs_map_unlock(sbi, &lc, flag);
f2fs_balance_fs(sbi, dn.node_changed);
}
out:
@@ -1805,7 +1877,8 @@ out:
return err;
}
-bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len)
+static bool __f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len,
+ bool check_first)
{
struct f2fs_map_blocks map;
block_t last_lblk;
@@ -1827,10 +1900,17 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len)
if (err || map.m_len == 0)
return false;
map.m_lblk += map.m_len;
+ if (check_first)
+ break;
}
return true;
}
+bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len)
+{
+ return __f2fs_overwrite_io(inode, pos, len, false);
+}
+
static int f2fs_xattr_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo)
{
@@ -2104,10 +2184,13 @@ static int f2fs_read_single_page(struct inode *inode, struct fsverity_info *vi,
/*
* Map blocks using the previous result first.
*/
- if ((map->m_flags & F2FS_MAP_MAPPED) &&
- block_in_file > map->m_lblk &&
+ if (map->m_flags & F2FS_MAP_MAPPED) {
+ if (block_in_file > map->m_lblk &&
block_in_file < (map->m_lblk + map->m_len))
+ goto got_it;
+ } else if (block_in_file < *map->m_next_pgofs) {
goto got_it;
+ }
/*
* Then do more f2fs_map_blocks() calls until we are
@@ -2343,6 +2426,185 @@ out:
}
#endif
+static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio)
+{
+ struct f2fs_folio_state *ffs = folio->private;
+
+ if (ffs)
+ return ffs;
+
+ ffs = f2fs_kmem_cache_alloc(ffs_entry_slab,
+ GFP_NOIO | __GFP_ZERO, true, NULL);
+
+ spin_lock_init(&ffs->state_lock);
+ folio_attach_private(folio, ffs);
+ return ffs;
+}
+
+static void ffs_detach_free(struct folio *folio)
+{
+ struct f2fs_folio_state *ffs;
+
+ if (!folio_test_large(folio)) {
+ folio_detach_private(folio);
+ return;
+ }
+
+ ffs = folio_detach_private(folio);
+ if (!ffs)
+ return;
+
+ WARN_ON_ONCE(ffs->read_pages_pending != 0);
+ kmem_cache_free(ffs_entry_slab, ffs);
+}
+
+static int f2fs_read_data_large_folio(struct inode *inode,
+ struct fsverity_info *vi,
+ struct readahead_control *rac, struct folio *folio)
+{
+ struct bio *bio = NULL;
+ sector_t last_block_in_bio = 0;
+ struct f2fs_map_blocks map = {0, };
+ pgoff_t index, offset, next_pgofs = 0;
+ unsigned max_nr_pages = rac ? readahead_count(rac) :
+ folio_nr_pages(folio);
+ unsigned nrpages;
+ struct f2fs_folio_state *ffs;
+ int ret = 0;
+ bool folio_in_bio;
+
+ if (!IS_IMMUTABLE(inode) || f2fs_compressed_file(inode)) {
+ if (folio)
+ folio_unlock(folio);
+ return -EOPNOTSUPP;
+ }
+
+ map.m_seg_type = NO_CHECK_TYPE;
+
+ if (rac)
+ folio = readahead_folio(rac);
+next_folio:
+ if (!folio)
+ goto out;
+
+ folio_in_bio = false;
+ index = folio->index;
+ offset = 0;
+ ffs = NULL;
+ nrpages = folio_nr_pages(folio);
+
+ for (; nrpages; nrpages--, max_nr_pages--, index++, offset++) {
+ sector_t block_nr;
+ /*
+ * Map blocks using the previous result first.
+ */
+ if (map.m_flags & F2FS_MAP_MAPPED) {
+ if (index > map.m_lblk &&
+ index < (map.m_lblk + map.m_len))
+ goto got_it;
+ } else if (index < next_pgofs) {
+ /* hole case */
+ goto got_it;
+ }
+
+ /*
+ * Then do more f2fs_map_blocks() calls until we are
+ * done with this page.
+ */
+ memset(&map, 0, sizeof(map));
+ map.m_next_pgofs = &next_pgofs;
+ map.m_seg_type = NO_CHECK_TYPE;
+ map.m_lblk = index;
+ map.m_len = max_nr_pages;
+
+ ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
+ if (ret)
+ goto err_out;
+got_it:
+ if ((map.m_flags & F2FS_MAP_MAPPED)) {
+ block_nr = map.m_pblk + index - map.m_lblk;
+ if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
+ DATA_GENERIC_ENHANCE_READ)) {
+ ret = -EFSCORRUPTED;
+ goto err_out;
+ }
+ } else {
+ size_t page_offset = offset << PAGE_SHIFT;
+ folio_zero_range(folio, page_offset, PAGE_SIZE);
+ if (vi && !fsverity_verify_blocks(vi, folio, PAGE_SIZE, page_offset)) {
+ ret = -EIO;
+ goto err_out;
+ }
+ continue;
+ }
+
+	/* We must increment read_pages_pending before any bio is submitted,
+	 * to prevent a premature folio_end_read() call on the folio.
+	 */
+ if (folio_test_large(folio)) {
+ ffs = ffs_find_or_alloc(folio);
+
+ /* set the bitmap to wait */
+ spin_lock_irq(&ffs->state_lock);
+ ffs->read_pages_pending++;
+ spin_unlock_irq(&ffs->state_lock);
+ }
+
+ /*
+ * This page will go to BIO. Do we need to send this
+ * BIO off first?
+ */
+ if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
+ last_block_in_bio, block_nr) ||
+ !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
+submit_and_realloc:
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ bio = NULL;
+ }
+ if (bio == NULL)
+ bio = f2fs_grab_read_bio(inode, vi,
+ block_nr, max_nr_pages,
+ f2fs_ra_op_flags(rac),
+ index, false);
+
+ /*
+ * If the page is under writeback, we need to wait for
+ * its completion to see the correct decrypted data.
+ */
+ f2fs_wait_on_block_writeback(inode, block_nr);
+
+ if (!bio_add_folio(bio, folio, F2FS_BLKSIZE,
+ offset << PAGE_SHIFT))
+ goto submit_and_realloc;
+
+ folio_in_bio = true;
+ inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
+ f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
+ F2FS_BLKSIZE);
+ last_block_in_bio = block_nr;
+ }
+ trace_f2fs_read_folio(folio, DATA);
+err_out:
+ if (!folio_in_bio) {
+ folio_end_read(folio, !ret);
+ if (ret)
+ return ret;
+ }
+ if (rac) {
+ folio = readahead_folio(rac);
+ goto next_folio;
+ }
+out:
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ if (ret) {
+ /* Wait bios and clear uptodate. */
+ folio_lock(folio);
+ folio_clear_uptodate(folio);
+ folio_unlock(folio);
+ }
+ return ret;
+}
+
/*
* This function was originally taken from fs/mpage.c, and customized for f2fs.
* Major change was from block_size == page_size in f2fs by default.
@@ -2367,10 +2629,15 @@ static int f2fs_mpage_readpages(struct inode *inode, struct fsverity_info *vi,
pgoff_t nc_cluster_idx = NULL_CLUSTER;
pgoff_t index;
#endif
+ pgoff_t next_pgofs = 0;
unsigned nr_pages = rac ? readahead_count(rac) : 1;
+ struct address_space *mapping = rac ? rac->mapping : folio->mapping;
unsigned max_nr_pages = nr_pages;
int ret = 0;
+ if (mapping_large_folio_support(mapping))
+ return f2fs_read_data_large_folio(inode, vi, rac, folio);
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
index = rac ? readahead_index(rac) : folio->index;
@@ -2383,7 +2650,7 @@ static int f2fs_mpage_readpages(struct inode *inode, struct fsverity_info *vi,
map.m_lblk = 0;
map.m_len = 0;
map.m_flags = 0;
- map.m_next_pgofs = NULL;
+ map.m_next_pgofs = &next_pgofs;
map.m_next_extent = NULL;
map.m_seg_type = NO_CHECK_TYPE;
map.m_may_create = false;
@@ -2464,8 +2731,7 @@ next_page:
}
#endif
}
- if (bio)
- f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
return ret;
}
@@ -2663,6 +2929,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
struct inode *inode = folio->mapping->host;
struct dnode_of_data dn;
struct node_info ni;
+ struct f2fs_lock_context lc;
bool ipu_force = false;
bool atomic_commit;
int err = 0;
@@ -2687,8 +2954,12 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
goto got_it;
}
+ if (is_sbi_flag_set(fio->sbi, SBI_ENABLE_CHECKPOINT) &&
+ time_to_inject(fio->sbi, FAULT_SKIP_WRITE))
+ return -EINVAL;
+
/* Deadlock due to between page->lock and f2fs_lock_op */
- if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi))
+ if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi, &lc))
return -EAGAIN;
err = f2fs_get_dnode_of_data(&dn, folio->index, LOOKUP_NODE);
@@ -2729,7 +3000,7 @@ got_it:
folio_start_writeback(folio);
f2fs_put_dnode(&dn);
if (fio->need_lock == LOCK_REQ)
- f2fs_unlock_op(fio->sbi);
+ f2fs_unlock_op(fio->sbi, &lc);
err = f2fs_inplace_write_data(fio);
if (err) {
if (fscrypt_inode_uses_fs_layer_crypto(inode))
@@ -2743,7 +3014,7 @@ got_it:
}
if (fio->need_lock == LOCK_RETRY) {
- if (!f2fs_trylock_op(fio->sbi)) {
+ if (!f2fs_trylock_op(fio->sbi, &lc)) {
err = -EAGAIN;
goto out_writepage;
}
@@ -2775,7 +3046,7 @@ out_writepage:
f2fs_put_dnode(&dn);
out:
if (fio->need_lock == LOCK_REQ)
- f2fs_unlock_op(fio->sbi);
+ f2fs_unlock_op(fio->sbi, &lc);
return err;
}
@@ -2855,19 +3126,21 @@ int f2fs_write_single_data_page(struct folio *folio, int *submitted,
write:
/* Dentry/quota blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode) || quota_inode) {
+ struct f2fs_lock_context lc;
+
/*
* We need to wait for node_write to avoid block allocation during
* checkpoint. This can only happen to quota writes which can cause
* the below discard race condition.
*/
if (quota_inode)
- f2fs_down_read(&sbi->node_write);
+ f2fs_down_read_trace(&sbi->node_write, &lc);
fio.need_lock = LOCK_DONE;
err = f2fs_do_write_data_page(&fio);
if (quota_inode)
- f2fs_up_read(&sbi->node_write);
+ f2fs_up_read_trace(&sbi->node_write, &lc);
goto done;
}
@@ -3237,6 +3510,8 @@ static inline bool __should_serialize_io(struct inode *inode,
if (IS_NOQUOTA(inode))
return false;
+ if (f2fs_is_pinned_file(inode))
+ return false;
if (f2fs_need_compress_data(inode))
return true;
if (wbc->sync_mode != WB_SYNC_ALL)
@@ -3259,6 +3534,16 @@ static inline void account_writeback(struct inode *inode, bool inc)
f2fs_up_read(&F2FS_I(inode)->i_sem);
}
+static inline void update_skipped_write(struct f2fs_sb_info *sbi,
+ struct writeback_control *wbc)
+{
+ long skipped = wbc->pages_skipped;
+
+ if (is_sbi_flag_set(sbi, SBI_ENABLE_CHECKPOINT) && skipped &&
+ wbc->sync_mode == WB_SYNC_ALL)
+ atomic_add(skipped, &sbi->nr_pages[F2FS_SKIPPED_WRITE]);
+}
+
static int __f2fs_write_data_pages(struct address_space *mapping,
struct writeback_control *wbc,
enum iostat_type io_type)
@@ -3323,10 +3608,19 @@ static int __f2fs_write_data_pages(struct address_space *mapping,
*/
f2fs_remove_dirty_inode(inode);
+
+ /*
+ * f2fs_write_cache_pages() has retry logic for EAGAIN case which is
+ * common when racing w/ checkpoint, so only update skipped write
+ * when ret is non-zero.
+ */
+ if (ret)
+ update_skipped_write(sbi, wbc);
return ret;
skip_write:
wbc->pages_skipped += get_dirty_pages(inode);
+ update_skipped_write(sbi, wbc);
trace_f2fs_writepages(mapping->host, wbc, DATA);
return 0;
}
@@ -3368,6 +3662,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
struct inode *inode = folio->mapping->host;
pgoff_t index = folio->index;
struct dnode_of_data dn;
+ struct f2fs_lock_context lc;
struct folio *ifolio;
bool locked = false;
int flag = F2FS_GET_BLOCK_PRE_AIO;
@@ -3384,10 +3679,10 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
if (f2fs_has_inline_data(inode)) {
if (pos + len > MAX_INLINE_DATA(inode))
flag = F2FS_GET_BLOCK_DEFAULT;
- f2fs_map_lock(sbi, flag);
+ f2fs_map_lock(sbi, &lc, flag);
locked = true;
} else if ((pos & PAGE_MASK) >= i_size_read(inode)) {
- f2fs_map_lock(sbi, flag);
+ f2fs_map_lock(sbi, &lc, flag);
locked = true;
}
@@ -3431,7 +3726,7 @@ restart:
if (!err && dn.data_blkaddr != NULL_ADDR)
goto out;
f2fs_put_dnode(&dn);
- f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO);
+ f2fs_map_lock(sbi, &lc, F2FS_GET_BLOCK_PRE_AIO);
WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO);
locked = true;
goto restart;
@@ -3445,7 +3740,7 @@ out:
f2fs_put_dnode(&dn);
unlock_out:
if (locked)
- f2fs_map_unlock(sbi, flag);
+ f2fs_map_unlock(sbi, &lc, flag);
return err;
}
@@ -3481,10 +3776,11 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index,
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
+ struct f2fs_lock_context lc;
struct folio *ifolio;
int err = 0;
- f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO);
+ f2fs_map_lock(sbi, &lc, F2FS_GET_BLOCK_PRE_AIO);
ifolio = f2fs_get_inode_folio(sbi, inode->i_ino);
if (IS_ERR(ifolio)) {
@@ -3502,7 +3798,7 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index,
f2fs_put_dnode(&dn);
unlock_out:
- f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO);
+ f2fs_map_unlock(sbi, &lc, F2FS_GET_BLOCK_PRE_AIO);
return err;
}
@@ -3761,7 +4057,12 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
f2fs_remove_dirty_inode(inode);
}
}
- folio_detach_private(folio);
+
+ if (offset || length != folio_size(folio))
+ return;
+
+ folio_cancel_dirty(folio);
+ ffs_detach_free(folio);
}
bool f2fs_release_folio(struct folio *folio, gfp_t wait)
@@ -3770,7 +4071,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait)
if (folio_test_dirty(folio))
return false;
- folio_detach_private(folio);
+ ffs_detach_free(folio);
return true;
}
@@ -3955,6 +4256,7 @@ static int check_swap_activate(struct swap_info_struct *sis,
while (cur_lblock < last_lblock && cur_lblock < sis->max) {
struct f2fs_map_blocks map;
+ bool last_extent = false;
retry:
cond_resched();
@@ -3980,11 +4282,10 @@ retry:
pblock = map.m_pblk;
nr_pblocks = map.m_len;
- if ((pblock - SM_I(sbi)->main_blkaddr) % blks_per_sec ||
- nr_pblocks % blks_per_sec ||
- f2fs_is_sequential_zone_area(sbi, pblock)) {
- bool last_extent = false;
-
+ if (!last_extent &&
+ ((pblock - SM_I(sbi)->main_blkaddr) % blks_per_sec ||
+ nr_pblocks % blks_per_sec ||
+ f2fs_is_sequential_zone_area(sbi, pblock))) {
not_aligned++;
nr_pblocks = roundup(nr_pblocks, blks_per_sec);
@@ -4005,8 +4306,8 @@ retry:
goto out;
}
- if (!last_extent)
- goto retry;
+ /* lookup block mapping info after block migration */
+ goto retry;
}
if (cur_lblock + nr_pblocks >= sis->max)
@@ -4176,12 +4477,25 @@ int __init f2fs_init_bio_entry_cache(void)
{
bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
sizeof(struct bio_entry));
- return bio_entry_slab ? 0 : -ENOMEM;
+
+ if (!bio_entry_slab)
+ return -ENOMEM;
+
+ ffs_entry_slab = f2fs_kmem_cache_create("f2fs_ffs_slab",
+ sizeof(struct f2fs_folio_state));
+
+ if (!ffs_entry_slab) {
+ kmem_cache_destroy(bio_entry_slab);
+ return -ENOMEM;
+ }
+
+ return 0;
}
void f2fs_destroy_bio_entry_cache(void)
{
kmem_cache_destroy(bio_entry_slab);
+ kmem_cache_destroy(ffs_entry_slab);
}
static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
@@ -4207,7 +4521,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
* f2fs_map_lock and f2fs_balance_fs are not necessary.
*/
if ((flags & IOMAP_WRITE) &&
- !f2fs_overwrite_io(inode, offset, length))
+ !__f2fs_overwrite_io(inode, offset, length, true))
map.m_may_create = true;
err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DIO);
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 032683835569..8e1040e375a7 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -423,6 +423,7 @@ static const char *s_flag[MAX_SBI_FLAG] = {
[SBI_IS_RESIZEFS] = "resizefs",
[SBI_IS_FREEZING] = "freezefs",
[SBI_IS_WRITABLE] = "writable",
+ [SBI_ENABLE_CHECKPOINT] = "enable_checkpoint",
};
static const char *ipu_mode_names[F2FS_IPU_MAX] = {
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index a90a62cfe617..bb34e864d0ef 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -54,7 +54,7 @@ enum {
FAULT_TRUNCATE,
FAULT_READ_IO,
FAULT_CHECKPOINT,
- FAULT_DISCARD,
+	FAULT_DISCARD,	/* obsolete: __blkdev_issue_discard() never fails */
FAULT_WRITE_IO,
FAULT_SLAB_ALLOC,
FAULT_DQUOT_INIT,
@@ -63,8 +63,10 @@ enum {
FAULT_BLKADDR_CONSISTENCE,
FAULT_NO_SEGMENT,
FAULT_INCONSISTENT_FOOTER,
- FAULT_TIMEOUT,
+ FAULT_ATOMIC_TIMEOUT,
FAULT_VMALLOC,
+ FAULT_LOCK_TIMEOUT,
+ FAULT_SKIP_WRITE,
FAULT_MAX,
};
@@ -72,7 +74,8 @@ enum {
enum fault_option {
FAULT_RATE = 1, /* only update fault rate */
FAULT_TYPE = 2, /* only update fault type */
- FAULT_ALL = 4, /* reset all fault injection options/stats */
+ FAULT_TIMEOUT = 4, /* only update fault timeout type */
+ FAULT_ALL = 8, /* reset all fault injection options/stats */
};
#ifdef CONFIG_F2FS_FAULT_INJECTION
@@ -82,6 +85,7 @@ struct f2fs_fault_info {
unsigned int inject_type;
/* Used to account total count of injection for each type */
unsigned int inject_count[FAULT_MAX];
+ unsigned int inject_lock_timeout; /* inject lock timeout */
};
extern const char *f2fs_fault_name[FAULT_MAX];
@@ -173,6 +177,26 @@ enum device_allocation_policy {
ALLOCATE_FORWARD_FROM_HINT,
};
+enum f2fs_lock_name {
+ LOCK_NAME_NONE,
+ LOCK_NAME_CP_RWSEM,
+ LOCK_NAME_NODE_CHANGE,
+ LOCK_NAME_NODE_WRITE,
+ LOCK_NAME_GC_LOCK,
+ LOCK_NAME_CP_GLOBAL,
+ LOCK_NAME_IO_RWSEM,
+ LOCK_NAME_MAX,
+};
+
+enum f2fs_timeout_type {
+ TIMEOUT_TYPE_NONE,
+ TIMEOUT_TYPE_RUNNING,
+ TIMEOUT_TYPE_IO_SLEEP,
+ TIMEOUT_TYPE_NONIO_SLEEP,
+ TIMEOUT_TYPE_RUNNABLE,
+ TIMEOUT_TYPE_MAX,
+};
+
/*
* An implementation of an rwsem that is explicitly unfair to readers. This
* prevents priority inversion when a low-priority reader acquires the read lock
@@ -181,6 +205,8 @@ enum device_allocation_policy {
*/
struct f2fs_rwsem {
+ struct f2fs_sb_info *sbi;
+ enum f2fs_lock_name name;
struct rw_semaphore internal_rwsem;
#ifdef CONFIG_F2FS_UNFAIR_RWSEM
wait_queue_head_t read_waiters;
@@ -287,7 +313,6 @@ enum {
#define DEF_CP_INTERVAL 60 /* 60 secs */
#define DEF_IDLE_INTERVAL 5 /* 5 secs */
#define DEF_DISABLE_INTERVAL 5 /* 5 secs */
-#define DEF_ENABLE_INTERVAL 5 /* 5 secs */
#define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */
#define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */
@@ -295,7 +320,9 @@ enum cp_time {
CP_TIME_START, /* begin */
CP_TIME_LOCK, /* after cp_global_sem */
CP_TIME_OP_LOCK, /* after block_operation */
- CP_TIME_FLUSH_META, /* after flush sit/nat */
+ CP_TIME_MERGE_WRITE, /* after flush DATA/NODE/META */
+ CP_TIME_FLUSH_NAT, /* after flush nat */
+ CP_TIME_FLUSH_SIT, /* after flush sit */
CP_TIME_SYNC_META, /* after sync_meta_pages */
CP_TIME_SYNC_CP_META, /* after sync cp meta pages */
CP_TIME_WAIT_DIRTY_META,/* after wait on dirty meta */
@@ -521,13 +548,25 @@ struct fsync_inode_entry {
#define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats))
#define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits))
-#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne)
-#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid)
-#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se)
-#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno)
-
-#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl))
-#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl))
+#define nat_in_journal(jnl, i) \
+ (((struct nat_journal_entry *)(jnl)->nat_j.entries)[i].ne)
+#define nid_in_journal(jnl, i) \
+ (((struct nat_journal_entry *)(jnl)->nat_j.entries)[i].nid)
+#define sit_in_journal(jnl, i) \
+ (((struct sit_journal_entry *)(jnl)->sit_j.entries)[i].se)
+#define segno_in_journal(jnl, i) \
+ (((struct sit_journal_entry *)(jnl)->sit_j.entries)[i].segno)
+
+#define sum_entries(sum) ((struct f2fs_summary *)(sum))
+#define sum_journal(sbi, sum) \
+ ((struct f2fs_journal *)((char *)(sum) + \
+ ((sbi)->entries_in_sum * sizeof(struct f2fs_summary))))
+#define sum_footer(sbi, sum) \
+ ((struct summary_footer *)((char *)(sum) + (sbi)->sum_blocksize - \
+ sizeof(struct summary_footer)))
+
+#define MAX_NAT_JENTRIES(sbi, jnl) ((sbi)->nat_journal_entries - nats_in_cursum(jnl))
+#define MAX_SIT_JENTRIES(sbi, jnl) ((sbi)->sit_journal_entries - sits_in_cursum(jnl))
static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i)
{
@@ -545,14 +584,6 @@ static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i)
return before;
}
-static inline bool __has_cursum_space(struct f2fs_journal *journal,
- int size, int type)
-{
- if (type == NAT_JOURNAL)
- return size <= MAX_NAT_JENTRIES(journal);
- return size <= MAX_SIT_JENTRIES(journal);
-}
-
/* for inline stuff */
#define DEF_INLINE_RESERVED_SIZE 1
static inline int get_extra_isize(struct inode *inode);
@@ -669,8 +700,10 @@ enum {
#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO or flush count */
-/* IO/non-IO congestion wait timeout value, default: 1ms */
-#define DEFAULT_SCHEDULE_TIMEOUT (msecs_to_jiffies(1))
+#define MAX_FLUSH_RETRY_COUNT 3 /* maximum flush retry count in f2fs_enable_checkpoint() */
+
+/* IO/non-IO congestion wait timeout value, default: 1 jiffy */
+#define DEFAULT_SCHEDULE_TIMEOUT 1
/* timeout value injected, default: 1000ms */
#define DEFAULT_FAULT_TIMEOUT (msecs_to_jiffies(1000))
@@ -1208,6 +1241,7 @@ enum count_type {
F2FS_RD_META,
F2FS_DIO_WRITE,
F2FS_DIO_READ,
+	F2FS_SKIPPED_WRITE,	/* writes skipped or failed during f2fs_enable_checkpoint() */
NR_COUNT_TYPE,
};
@@ -1396,6 +1430,27 @@ struct atgc_management {
unsigned long long age_threshold; /* age threshold */
};
+struct f2fs_time_stat {
+ unsigned long long total_time; /* total wall clock time */
+#ifdef CONFIG_64BIT
+ unsigned long long running_time; /* running time */
+#endif
+#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
+	unsigned long long runnable_time;	/* runnable (including preempted) time */
+#endif
+#ifdef CONFIG_TASK_DELAY_ACCT
+ unsigned long long io_sleep_time; /* IO sleep time */
+#endif
+};
+
+struct f2fs_lock_context {
+ struct f2fs_time_stat ts;
+ int orig_nice;
+ int new_nice;
+ bool lock_trace;
+ bool need_restore;
+};
+
struct f2fs_gc_control {
unsigned int victim_segno; /* target victim segment number */
int init_gc_type; /* FG_GC or BG_GC */
@@ -1404,6 +1459,7 @@ struct f2fs_gc_control {
bool err_gc_skipped; /* return EAGAIN if GC skipped */
bool one_time; /* require one time GC in one migration unit */
unsigned int nr_free_secs; /* # of free sections to do GC */
+ struct f2fs_lock_context lc; /* lock context for gc_lock */
};
/*
@@ -1427,6 +1483,7 @@ enum {
SBI_IS_RESIZEFS, /* resizefs is in process */
SBI_IS_FREEZING, /* freezefs is in process */
SBI_IS_WRITABLE, /* remove ro mountoption transiently */
+ SBI_ENABLE_CHECKPOINT, /* indicate it's during f2fs_enable_checkpoint() */
MAX_SBI_FLAG,
};
@@ -1436,7 +1493,6 @@ enum {
DISCARD_TIME,
GC_TIME,
DISABLE_TIME,
- ENABLE_TIME,
UMOUNT_DISCARD_TIMEOUT,
MAX_TIME,
};
@@ -1522,6 +1578,20 @@ enum f2fs_lookup_mode {
LOOKUP_AUTO,
};
+/* For node type in __get_node_folio() */
+enum node_type {
+ NODE_TYPE_REGULAR,
+ NODE_TYPE_INODE,
+ NODE_TYPE_XATTR,
+ NODE_TYPE_NON_INODE,
+};
+
+/* threshold of maximum elapsed time in a critical region before printing a tracepoint */
+#define MAX_LOCK_ELAPSED_TIME 500
+
+#define F2FS_DEFAULT_TASK_PRIORITY (DEFAULT_PRIO)
+#define F2FS_CRITICAL_TASK_PRIORITY NICE_TO_PRIO(0)
+
static inline int f2fs_test_bit(unsigned int nr, char *addr);
static inline void f2fs_set_bit(unsigned int nr, char *addr);
static inline void f2fs_clear_bit(unsigned int nr, char *addr);
@@ -1714,7 +1784,6 @@ struct f2fs_sb_info {
long interval_time[MAX_TIME]; /* to store thresholds */
struct ckpt_req_control cprc_info; /* for checkpoint request control */
struct cp_stats cp_stats; /* for time stat of checkpoint */
- struct f2fs_rwsem cp_enable_rwsem; /* block cache/dio write */
struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */
@@ -1760,7 +1829,16 @@ struct f2fs_sb_info {
unsigned int total_valid_node_count; /* valid node block count */
int dir_level; /* directory level */
bool readdir_ra; /* readahead inode in readdir */
- u64 max_io_bytes; /* max io bytes to merge IOs */
+ unsigned int max_io_bytes; /* max io bytes to merge IOs */
+
+ /* variable summary block units */
+ unsigned int sum_blocksize; /* sum block size */
+ unsigned int sums_per_block; /* sum block count per block */
+ unsigned int entries_in_sum; /* entry count in sum block */
+ unsigned int sum_entry_size; /* total entry size in sum block */
+ unsigned int sum_journal_size; /* journal size in sum block */
+ unsigned int nat_journal_entries; /* nat journal entry count in the journal */
+ unsigned int sit_journal_entries; /* sit journal entry count in the journal */
block_t user_block_count; /* # of user blocks */
block_t total_valid_block_count; /* # of valid blocks */
@@ -1908,7 +1986,7 @@ struct f2fs_sb_info {
unsigned int gc_segment_mode; /* GC state for reclaimed segments */
unsigned int gc_reclaimed_segs[MAX_GC_MODE]; /* Reclaimed segs for each mode */
- unsigned long seq_file_ra_mul; /* multiplier for ra_pages of seq. files in fadvise */
+ unsigned int seq_file_ra_mul; /* multiplier for ra_pages of seq. files in fadvise */
int max_fragment_chunk; /* max chunk size for block fragmentation mode */
int max_fragment_hole; /* max hole size for block fragmentation mode */
@@ -1922,6 +2000,18 @@ struct f2fs_sb_info {
/* carve out reserved_blocks from total blocks */
bool carve_out;
+ /* max elapsed time threshold in critical region that lock covered */
+ unsigned long long max_lock_elapsed_time;
+
+ /* enable/disable to adjust task priority in critical region covered by lock */
+ unsigned int adjust_lock_priority;
+
+ /* adjust priority for task which is in critical region covered by lock */
+ unsigned int lock_duration_priority;
+
+ /* priority for critical task, e.g. f2fs_ckpt, f2fs_gc threads */
+ long critical_task_priority;
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
struct kmem_cache *page_array_slab; /* page array entry */
unsigned int page_array_slab_size; /* default page array slab size */
@@ -2261,16 +2351,22 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
spin_unlock_irqrestore(&sbi->cp_lock, flags);
}
-#define init_f2fs_rwsem(sem) \
+#define init_f2fs_rwsem(sem) __init_f2fs_rwsem(sem, NULL, LOCK_NAME_NONE)
+#define init_f2fs_rwsem_trace __init_f2fs_rwsem
+
+#define __init_f2fs_rwsem(sem, sbi, name) \
do { \
static struct lock_class_key __key; \
\
- __init_f2fs_rwsem((sem), #sem, &__key); \
+ do_init_f2fs_rwsem((sem), #sem, &__key, sbi, name); \
} while (0)
-static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem,
- const char *sem_name, struct lock_class_key *key)
+static inline void do_init_f2fs_rwsem(struct f2fs_rwsem *sem,
+ const char *sem_name, struct lock_class_key *key,
+ struct f2fs_sb_info *sbi, enum f2fs_lock_name name)
{
+ sem->sbi = sbi;
+ sem->name = name;
__init_rwsem(&sem->internal_rwsem, sem_name, key);
#ifdef CONFIG_F2FS_UNFAIR_RWSEM
init_waitqueue_head(&sem->read_waiters);
@@ -2339,6 +2435,16 @@ static inline void f2fs_up_write(struct f2fs_rwsem *sem)
#endif
}
+void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc);
+int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem,
+ struct f2fs_lock_context *lc);
+void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc);
+void f2fs_down_write_trace(struct f2fs_rwsem *sem,
+ struct f2fs_lock_context *lc);
+int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem,
+ struct f2fs_lock_context *lc);
+void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc);
+
static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock)
{
unsigned long flags;
@@ -2369,33 +2475,6 @@ static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi,
return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set;
}
-static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
-{
- f2fs_down_read(&sbi->cp_rwsem);
-}
-
-static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi)
-{
- if (time_to_inject(sbi, FAULT_LOCK_OP))
- return 0;
- return f2fs_down_read_trylock(&sbi->cp_rwsem);
-}
-
-static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
-{
- f2fs_up_read(&sbi->cp_rwsem);
-}
-
-static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
-{
- f2fs_down_write(&sbi->cp_rwsem);
-}
-
-static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
-{
- f2fs_up_write(&sbi->cp_rwsem);
-}
-
static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
{
int reason = CP_SYNC;
@@ -2811,6 +2890,14 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
}
+static inline bool __has_cursum_space(struct f2fs_sb_info *sbi,
+ struct f2fs_journal *journal, unsigned int size, int type)
+{
+ if (type == NAT_JOURNAL)
+ return size <= MAX_NAT_JENTRIES(sbi, journal);
+ return size <= MAX_SIT_JENTRIES(sbi, journal);
+}
+
extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync);
static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
struct inode *inode, bool is_inode)
@@ -3722,7 +3809,7 @@ void f2fs_update_inode_page(struct inode *inode);
int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc);
void f2fs_remove_donate_inode(struct inode *inode);
void f2fs_evict_inode(struct inode *inode);
-void f2fs_handle_failed_inode(struct inode *inode);
+void f2fs_handle_failed_inode(struct inode *inode, struct f2fs_lock_context *lc);
/*
* namei.c
@@ -3855,6 +3942,9 @@ struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs);
void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid);
struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid,
enum node_type node_type);
+int f2fs_sanity_check_node_footer(struct f2fs_sb_info *sbi,
+ struct folio *folio, pgoff_t nid,
+ enum node_type ntype, bool in_irq);
struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino);
struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid);
int f2fs_move_node_folio(struct folio *node_folio, int gc_type);
@@ -3954,7 +4044,8 @@ void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
block_t len);
void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk);
void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk);
-int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
+int f2fs_lookup_journal_in_cursum(struct f2fs_sb_info *sbi,
+ struct f2fs_journal *journal, int type,
unsigned int val, int alloc);
void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi);
@@ -3989,6 +4080,9 @@ static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi)
/*
* checkpoint.c
*/
+void f2fs_lock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc);
+int f2fs_trylock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc);
+void f2fs_unlock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc);
void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io,
unsigned char reason);
void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi);
@@ -4004,8 +4098,8 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
int type, bool sync);
void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
unsigned int ra_blocks);
-long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
- long nr_to_write, enum iostat_type io_type);
+long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, long nr_to_write,
+ enum iostat_type io_type);
void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type);
void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type);
void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all);
@@ -4050,6 +4144,8 @@ void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type);
void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
struct inode *inode, struct folio *folio,
nid_t ino, enum page_type type);
+void f2fs_submit_merged_write_folio(struct f2fs_sb_info *sbi,
+ struct folio *folio, enum page_type type);
void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
struct bio **bio, struct folio *folio);
void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi);
@@ -4887,6 +4983,7 @@ static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi,
#ifdef CONFIG_F2FS_FAULT_INJECTION
extern int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate,
unsigned long type, enum fault_option fo);
+extern void f2fs_simulate_lock_timeout(struct f2fs_sb_info *sbi);
#else
static inline int f2fs_build_fault_attr(struct f2fs_sb_info *sbi,
unsigned long rate, unsigned long type,
@@ -4894,6 +4991,10 @@ static inline int f2fs_build_fault_attr(struct f2fs_sb_info *sbi,
{
return 0;
}
+static inline void f2fs_simulate_lock_timeout(struct f2fs_sb_info *sbi)
+{
+ return;
+}
#endif
static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
@@ -4909,6 +5010,22 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
return false;
}
+static inline bool f2fs_quota_file(struct f2fs_sb_info *sbi, nid_t ino)
+{
+#ifdef CONFIG_QUOTA
+ int i;
+
+ if (!f2fs_sb_has_quota_ino(sbi))
+ return false;
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (f2fs_qf_ino(sbi->sb, i) == ino)
+ return true;
+ }
+#endif
+ return false;
+}
+
static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
{
return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
@@ -4928,16 +5045,14 @@ static inline void __f2fs_schedule_timeout(long timeout, bool io)
#define f2fs_schedule_timeout(timeout) \
__f2fs_schedule_timeout(timeout, false)
-static inline void f2fs_io_schedule_timeout_killable(long timeout)
+static inline void f2fs_schedule_timeout_killable(long timeout, bool io)
{
- while (timeout) {
+ unsigned long last_time = jiffies + timeout;
+
+ while (jiffies < last_time) {
if (fatal_signal_pending(current))
return;
- set_current_state(TASK_UNINTERRUPTIBLE);
- io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
- if (timeout <= DEFAULT_SCHEDULE_TIMEOUT)
- return;
- timeout -= DEFAULT_SCHEDULE_TIMEOUT;
+ __f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT, io);
}
}
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 1fdbe18692be..c8a2f17a8f11 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -626,6 +626,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
if (!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
+ if (mapping_large_folio_support(inode->i_mapping) &&
+ filp->f_mode & FMODE_WRITE)
+ return -EOPNOTSUPP;
+
err = fsverity_file_open(inode, filp);
if (err)
return err;
@@ -772,6 +776,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
+ struct f2fs_lock_context lc;
pgoff_t free_from;
int count = 0, err = 0;
struct folio *ifolio;
@@ -790,7 +795,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock)
goto free_partial;
if (lock)
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
ifolio = f2fs_get_inode_folio(sbi, inode->i_ino);
if (IS_ERR(ifolio)) {
@@ -841,7 +846,7 @@ free_next:
err = f2fs_truncate_inode_blocks(inode, free_from);
out:
if (lock)
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
free_partial:
/* lastly zero out the first data page */
if (!err)
@@ -1112,11 +1117,13 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
if (i_uid_needs_update(idmap, attr, inode) ||
i_gid_needs_update(idmap, attr, inode)) {
- f2fs_lock_op(sbi);
+ struct f2fs_lock_context lc;
+
+ f2fs_lock_op(sbi, &lc);
err = dquot_transfer(idmap, inode, attr);
if (err) {
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
return err;
}
/*
@@ -1126,7 +1133,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
i_uid_update(idmap, attr, inode);
i_gid_update(idmap, attr, inode);
f2fs_mark_inode_dirty_sync(inode, true);
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
}
if (attr->ia_valid & ATTR_SIZE) {
@@ -1210,15 +1217,16 @@ static int fill_zero(struct inode *inode, pgoff_t index,
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct folio *folio;
+ struct f2fs_lock_context lc;
if (!len)
return 0;
f2fs_balance_fs(sbi, true);
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
folio = f2fs_get_new_data_folio(inode, NULL, index, false);
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
if (IS_ERR(folio))
return PTR_ERR(folio);
@@ -1301,6 +1309,7 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (pg_start < pg_end) {
loff_t blk_start, blk_end;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_lock_context lc;
f2fs_balance_fs(sbi, true);
@@ -1312,9 +1321,9 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
truncate_pagecache_range(inode, blk_start, blk_end - 1);
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
ret = f2fs_truncate_hole(inode, pg_start, pg_end);
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
filemap_invalidate_unlock(inode->i_mapping);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -1546,6 +1555,7 @@ roll_back:
static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_lock_context lc;
pgoff_t nrpages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
pgoff_t start = offset >> PAGE_SHIFT;
pgoff_t end = (offset + len) >> PAGE_SHIFT;
@@ -1559,11 +1569,11 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
f2fs_zero_post_eof_page(inode, offset + len, false);
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
f2fs_drop_extent_tree(inode);
truncate_pagecache(inode, offset);
ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true);
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
filemap_invalidate_unlock(inode->i_mapping);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -1711,6 +1721,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
for (index = pg_start; index < pg_end;) {
struct dnode_of_data dn;
+ struct f2fs_lock_context lc;
unsigned int end_offset;
pgoff_t end;
@@ -1721,12 +1732,12 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
(loff_t)index << PAGE_SHIFT,
((loff_t)pg_end << PAGE_SHIFT) - 1);
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
set_new_dnode(&dn, inode, NULL, NULL, 0);
ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE);
if (ret) {
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
filemap_invalidate_unlock(mapping);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
goto out;
@@ -1738,7 +1749,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
ret = f2fs_do_zero_range(&dn, index, end);
f2fs_put_dnode(&dn);
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
filemap_invalidate_unlock(mapping);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -1821,17 +1832,19 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
truncate_pagecache(inode, offset);
while (!ret && idx > pg_start) {
+ struct f2fs_lock_context lc;
+
nr = idx - pg_start;
if (nr > delta)
nr = delta;
idx -= nr;
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
f2fs_drop_extent_tree(inode);
ret = __exchange_data_block(inode, inode, idx,
idx + delta, nr, false);
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
}
filemap_invalidate_unlock(mapping);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -1913,7 +1926,7 @@ next_alloc:
if (has_not_enough_free_secs(sbi, 0,
sbi->reserved_pin_section)) {
- f2fs_down_write(&sbi->gc_lock);
+ f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc);
stat_inc_gc_call_count(sbi, FOREGROUND);
err = f2fs_gc(sbi, &gc_control);
if (err && err != -ENODATA) {
@@ -2448,7 +2461,7 @@ int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag,
f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN);
break;
case F2FS_GOING_DOWN_METAFLUSH:
- f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO);
+ f2fs_sync_meta_pages(sbi, LONG_MAX, FS_META_IO);
f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN);
break;
case F2FS_GOING_DOWN_NEED_FSCK:
@@ -2764,12 +2777,13 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
return ret;
if (!sync) {
- if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
+ if (!f2fs_down_write_trylock_trace(&sbi->gc_lock,
+ &gc_control.lc)) {
ret = -EBUSY;
goto out;
}
} else {
- f2fs_down_write(&sbi->gc_lock);
+ f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc);
}
gc_control.init_gc_type = sync ? FG_GC : BG_GC;
@@ -2809,12 +2823,12 @@ static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range)
do_more:
if (!range->sync) {
- if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
+ if (!f2fs_down_write_trylock_trace(&sbi->gc_lock, &gc_control.lc)) {
ret = -EBUSY;
goto out;
}
} else {
- f2fs_down_write(&sbi->gc_lock);
+ f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc);
}
gc_control.victim_segno = GET_SEGNO(sbi, range->start);
@@ -3087,6 +3101,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
struct inode *src = file_inode(file_in);
struct inode *dst = file_inode(file_out);
struct f2fs_sb_info *sbi = F2FS_I_SB(src);
+ struct f2fs_lock_context lc;
size_t olen = len, dst_max_i_size = 0;
size_t dst_osize;
int ret;
@@ -3182,7 +3197,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
goto out_src;
}
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
ret = __exchange_data_block(src, dst, F2FS_BYTES_TO_BLK(pos_in),
F2FS_BYTES_TO_BLK(pos_out),
F2FS_BYTES_TO_BLK(len), false);
@@ -3193,7 +3208,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
else if (dst_osize != dst->i_size)
f2fs_i_size_write(dst, dst_osize);
}
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
if (src != dst)
f2fs_up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]);
@@ -3304,7 +3319,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
end_segno = min(start_segno + range.segments, dev_end_segno);
while (start_segno < end_segno) {
- if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
+ if (!f2fs_down_write_trylock_trace(&sbi->gc_lock, &gc_control.lc)) {
ret = -EBUSY;
goto out;
}
@@ -3361,6 +3376,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode *ri = NULL;
+ struct f2fs_lock_context lc;
kprojid_t kprojid;
int err;
@@ -3391,7 +3407,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
if (err)
return err;
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
err = f2fs_transfer_project_quota(inode, kprojid);
if (err)
goto out_unlock;
@@ -3400,7 +3416,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
inode_set_ctime_current(inode);
f2fs_mark_inode_dirty_sync(inode, true);
out_unlock:
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
return err;
}
#else
@@ -3833,6 +3849,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
struct inode *inode = file_inode(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_lock_context lc;
pgoff_t page_idx = 0, last_idx;
unsigned int released_blocks = 0;
int ret;
@@ -3887,12 +3904,12 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
struct dnode_of_data dn;
pgoff_t end_offset, count;
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
set_new_dnode(&dn, inode, NULL, NULL, 0);
ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE);
if (ret) {
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
if (ret == -ENOENT) {
page_idx = f2fs_get_next_page_offset(&dn,
page_idx);
@@ -3910,7 +3927,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
f2fs_put_dnode(&dn);
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
if (ret < 0)
break;
@@ -4063,14 +4080,15 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
while (page_idx < last_idx) {
struct dnode_of_data dn;
+ struct f2fs_lock_context lc;
pgoff_t end_offset, count;
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
set_new_dnode(&dn, inode, NULL, NULL, 0);
ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE);
if (ret) {
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
if (ret == -ENOENT) {
page_idx = f2fs_get_next_page_offset(&dn,
page_idx);
@@ -4088,7 +4106,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
f2fs_put_dnode(&dn);
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
if (ret < 0)
break;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 384fa7e2085b..f46b2673d31f 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -102,21 +102,22 @@ static int gc_thread_func(void *data)
if (sbi->gc_mode == GC_URGENT_HIGH ||
sbi->gc_mode == GC_URGENT_MID) {
wait_ms = gc_th->urgent_sleep_time;
- f2fs_down_write(&sbi->gc_lock);
+ f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc);
goto do_gc;
}
if (foreground) {
- f2fs_down_write(&sbi->gc_lock);
+ f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc);
goto do_gc;
- } else if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
+ } else if (!f2fs_down_write_trylock_trace(&sbi->gc_lock,
+ &gc_control.lc)) {
stat_other_skip_bggc_count(sbi);
goto next;
}
if (!is_idle(sbi, GC_TIME)) {
increase_sleep_time(gc_th, &wait_ms);
- f2fs_up_write(&sbi->gc_lock);
+ f2fs_up_write_trace(&sbi->gc_lock, &gc_control.lc);
stat_io_skip_bggc_count(sbi);
goto next;
}
@@ -125,7 +126,8 @@ static int gc_thread_func(void *data)
if (has_enough_free_blocks(sbi,
gc_th->no_zoned_gc_percent)) {
wait_ms = gc_th->no_gc_sleep_time;
- f2fs_up_write(&sbi->gc_lock);
+ f2fs_up_write_trace(&sbi->gc_lock,
+ &gc_control.lc);
goto next;
}
if (wait_ms == gc_th->no_gc_sleep_time)
@@ -232,6 +234,8 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
return err;
}
+ set_user_nice(gc_th->f2fs_gc_task,
+ PRIO_TO_NICE(sbi->critical_task_priority));
return 0;
}
@@ -1031,7 +1035,8 @@ static int check_valid_map(struct f2fs_sb_info *sbi,
* ignore that.
*/
static int gc_node_segment(struct f2fs_sb_info *sbi,
- struct f2fs_summary *sum, unsigned int segno, int gc_type)
+ struct f2fs_summary *sum, unsigned int segno, int gc_type,
+ struct blk_plug *plug)
{
struct f2fs_summary *entry;
block_t start_addr;
@@ -1100,8 +1105,11 @@ next_step:
stat_inc_node_blk_count(sbi, 1, gc_type);
}
- if (++phase < 3)
+ if (++phase < 3) {
+ blk_finish_plug(plug);
+ blk_start_plug(plug);
goto next_step;
+ }
if (fggc)
atomic_dec(&sbi->wb_sync_req[NODE]);
@@ -1453,7 +1461,11 @@ up_out:
put_out:
f2fs_put_dnode(&dn);
out:
- f2fs_folio_put(folio, true);
+ if (!folio_test_uptodate(folio))
+ __folio_set_dropbehind(folio);
+ folio_unlock(folio);
+ folio_end_dropbehind(folio);
+ folio_put(folio);
return err;
}
@@ -1535,7 +1547,7 @@ out:
*/
static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
struct gc_inode_list *gc_list, unsigned int segno, int gc_type,
- bool force_migrate)
+ bool force_migrate, struct blk_plug *plug)
{
struct super_block *sb = sbi->sb;
struct f2fs_summary *entry;
@@ -1703,8 +1715,11 @@ next_step:
}
}
- if (++phase < 5)
+ if (++phase < 5) {
+ blk_finish_plug(plug);
+ blk_start_plug(plug);
goto next_step;
+ }
return submitted;
}
@@ -1769,8 +1784,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type);
- segno = rounddown(segno, SUMS_PER_BLOCK);
- sum_blk_cnt = DIV_ROUND_UP(end_segno - segno, SUMS_PER_BLOCK);
+ segno = rounddown(segno, sbi->sums_per_block);
+ sum_blk_cnt = DIV_ROUND_UP(end_segno - segno, sbi->sums_per_block);
/* readahead multi ssa blocks those have contiguous address */
if (__is_large_section(sbi))
f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
@@ -1780,17 +1795,17 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
while (segno < end_segno) {
struct folio *sum_folio = f2fs_get_sum_folio(sbi, segno);
- segno += SUMS_PER_BLOCK;
+ segno += sbi->sums_per_block;
if (IS_ERR(sum_folio)) {
int err = PTR_ERR(sum_folio);
- end_segno = segno - SUMS_PER_BLOCK;
- segno = rounddown(start_segno, SUMS_PER_BLOCK);
+ end_segno = segno - sbi->sums_per_block;
+ segno = rounddown(start_segno, sbi->sums_per_block);
while (segno < end_segno) {
sum_folio = filemap_get_folio(META_MAPPING(sbi),
GET_SUM_BLOCK(sbi, segno));
folio_put_refs(sum_folio, 2);
- segno += SUMS_PER_BLOCK;
+ segno += sbi->sums_per_block;
}
return err;
}
@@ -1806,8 +1821,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
/* find segment summary of victim */
struct folio *sum_folio = filemap_get_folio(META_MAPPING(sbi),
GET_SUM_BLOCK(sbi, segno));
- unsigned int block_end_segno = rounddown(segno, SUMS_PER_BLOCK)
- + SUMS_PER_BLOCK;
+ unsigned int block_end_segno = rounddown(segno, sbi->sums_per_block)
+ + sbi->sums_per_block;
if (block_end_segno > end_segno)
block_end_segno = end_segno;
@@ -1833,12 +1848,13 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
migrated >= sbi->migration_granularity)
continue;
- sum = SUM_BLK_PAGE_ADDR(sum_folio, cur_segno);
- if (type != GET_SUM_TYPE((&sum->footer))) {
+ sum = SUM_BLK_PAGE_ADDR(sbi, sum_folio, cur_segno);
+ if (type != GET_SUM_TYPE(sum_footer(sbi, sum))) {
f2fs_err(sbi, "Inconsistent segment (%u) type "
"[%d, %d] in SSA and SIT",
cur_segno, type,
- GET_SUM_TYPE((&sum->footer)));
+ GET_SUM_TYPE(
+ sum_footer(sbi, sum)));
f2fs_stop_checkpoint(sbi, false,
STOP_CP_REASON_CORRUPTED_SUMMARY);
continue;
@@ -1853,11 +1869,11 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
*/
if (type == SUM_TYPE_NODE)
submitted += gc_node_segment(sbi, sum->entries,
- cur_segno, gc_type);
+ cur_segno, gc_type, &plug);
else
submitted += gc_data_segment(sbi, sum->entries,
gc_list, cur_segno,
- gc_type, force_migrate);
+ gc_type, force_migrate, &plug);
stat_inc_gc_seg_count(sbi, data_type, gc_type);
sbi->gc_reclaimed_segs[sbi->gc_mode]++;
@@ -2000,7 +2016,7 @@ retry:
goto stop;
}
- __get_secs_required(sbi, NULL, &upper_secs, NULL);
+ upper_secs = __get_secs_required(sbi);
/*
* Write checkpoint to reclaim prefree segments.
@@ -2035,7 +2051,7 @@ stop:
reserved_segments(sbi),
prefree_segments(sbi));
- f2fs_up_write(&sbi->gc_lock);
+ f2fs_up_write_trace(&sbi->gc_lock, &gc_control->lc);
put_gc_inode(&gc_list);
@@ -2096,6 +2112,7 @@ int f2fs_gc_range(struct f2fs_sb_info *sbi,
if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
+ stat_inc_gc_call_count(sbi, FOREGROUND);
for (segno = start_seg; segno <= end_seg; segno += SEGS_PER_SEC(sbi)) {
struct gc_inode_list gc_list = {
.ilist = LIST_HEAD_INIT(gc_list.ilist),
@@ -2251,6 +2268,9 @@ int f2fs_resize_fs(struct file *filp, __u64 block_count)
struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp));
__u64 old_block_count, shrunk_blocks;
struct cp_control cpc = { CP_RESIZE, 0, 0, 0 };
+ struct f2fs_lock_context lc;
+ struct f2fs_lock_context glc;
+ struct f2fs_lock_context clc;
unsigned int secs;
int err = 0;
__u32 rem;
@@ -2294,13 +2314,13 @@ int f2fs_resize_fs(struct file *filp, __u64 block_count)
secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi));
/* stop other GC */
- if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
+ if (!f2fs_down_write_trylock_trace(&sbi->gc_lock, &glc)) {
err = -EAGAIN;
goto out_drop_write;
}
/* stop CP to protect MAIN_SEC in free_segment_range */
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
spin_lock(&sbi->stat_lock);
if (shrunk_blocks + valid_user_blocks(sbi) +
@@ -2315,8 +2335,8 @@ int f2fs_resize_fs(struct file *filp, __u64 block_count)
err = free_segment_range(sbi, secs, true);
out_unlock:
- f2fs_unlock_op(sbi);
- f2fs_up_write(&sbi->gc_lock);
+ f2fs_unlock_op(sbi, &lc);
+ f2fs_up_write_trace(&sbi->gc_lock, &glc);
out_drop_write:
mnt_drop_write_file(filp);
if (err)
@@ -2333,8 +2353,8 @@ out_drop_write:
return -EROFS;
}
- f2fs_down_write(&sbi->gc_lock);
- f2fs_down_write(&sbi->cp_global_sem);
+ f2fs_down_write_trace(&sbi->gc_lock, &glc);
+ f2fs_down_write_trace(&sbi->cp_global_sem, &clc);
spin_lock(&sbi->stat_lock);
if (shrunk_blocks + valid_user_blocks(sbi) +
@@ -2382,8 +2402,8 @@ recover_out:
spin_unlock(&sbi->stat_lock);
}
out_err:
- f2fs_up_write(&sbi->cp_global_sem);
- f2fs_up_write(&sbi->gc_lock);
+ f2fs_up_write_trace(&sbi->cp_global_sem, &clc);
+ f2fs_up_write_trace(&sbi->gc_lock, &glc);
thaw_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL);
return err;
}
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index e5c6a08b7e4f..0a1052d5ee62 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -218,6 +218,7 @@ int f2fs_convert_inline_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
+ struct f2fs_lock_context lc;
struct folio *ifolio, *folio;
int err = 0;
@@ -235,7 +236,7 @@ int f2fs_convert_inline_inode(struct inode *inode)
if (IS_ERR(folio))
return PTR_ERR(folio);
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
ifolio = f2fs_get_inode_folio(sbi, inode->i_ino);
if (IS_ERR(ifolio)) {
@@ -250,7 +251,7 @@ int f2fs_convert_inline_inode(struct inode *inode)
f2fs_put_dnode(&dn);
out:
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
f2fs_folio_put(folio, true);
@@ -597,13 +598,14 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry)
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct folio *ifolio;
struct f2fs_filename fname;
+ struct f2fs_lock_context lc;
void *inline_dentry = NULL;
int err = 0;
if (!f2fs_has_inline_dentry(dir))
return 0;
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
err = f2fs_setup_filename(dir, &dentry->d_name, 0, &fname);
if (err)
@@ -628,7 +630,7 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry)
out_fname:
f2fs_free_filename(&fname);
out:
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
return err;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index ee332b994348..e0f850b3f0c3 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -597,6 +597,8 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
if (ret)
goto bad_inode;
make_now:
+ f2fs_set_inode_flags(inode);
+
if (ino == F2FS_NODE_INO(sbi)) {
inode->i_mapping->a_ops = &f2fs_node_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
@@ -618,6 +620,9 @@ make_now:
inode->i_op = &f2fs_file_inode_operations;
inode->i_fop = &f2fs_file_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ if (IS_IMMUTABLE(inode) && !f2fs_compressed_file(inode) &&
+ !f2fs_quota_file(sbi, inode->i_ino))
+ mapping_set_folio_min_order(inode->i_mapping, 0);
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
@@ -638,7 +643,6 @@ make_now:
ret = -EIO;
goto bad_inode;
}
- f2fs_set_inode_flags(inode);
unlock_new_inode(inode);
trace_f2fs_iget(inode);
@@ -906,9 +910,11 @@ retry:
err = -EIO;
if (!err) {
- f2fs_lock_op(sbi);
+ struct f2fs_lock_context lc;
+
+ f2fs_lock_op(sbi, &lc);
err = f2fs_remove_inode_page(inode);
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
if (err == -ENOENT) {
err = 0;
@@ -1004,7 +1010,7 @@ out_clear:
}
/* caller should call f2fs_lock_op() */
-void f2fs_handle_failed_inode(struct inode *inode)
+void f2fs_handle_failed_inode(struct inode *inode, struct f2fs_lock_context *lc)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct node_info ni;
@@ -1053,7 +1059,7 @@ void f2fs_handle_failed_inode(struct inode *inode)
}
out:
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, lc);
/* iput will drop the inode object */
iput(inode);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 043d20516a21..e360f08a9586 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -354,6 +354,7 @@ static int f2fs_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ struct f2fs_lock_context lc;
struct inode *inode;
nid_t ino = 0;
int err;
@@ -376,11 +377,11 @@ static int f2fs_create(struct mnt_idmap *idmap, struct inode *dir,
inode->i_mapping->a_ops = &f2fs_dblock_aops;
ino = inode->i_ino;
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
err = f2fs_add_link(dentry, inode);
if (err)
goto out;
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
f2fs_alloc_nid_done(sbi, ino);
@@ -392,7 +393,7 @@ static int f2fs_create(struct mnt_idmap *idmap, struct inode *dir,
f2fs_balance_fs(sbi, true);
return 0;
out:
- f2fs_handle_failed_inode(inode);
+ f2fs_handle_failed_inode(inode, &lc);
return err;
}
@@ -401,6 +402,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
{
struct inode *inode = d_inode(old_dentry);
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ struct f2fs_lock_context lc;
int err;
if (unlikely(f2fs_cp_error(sbi)))
@@ -427,11 +429,11 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
ihold(inode);
set_inode_flag(inode, FI_INC_LINK);
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
err = f2fs_add_link(dentry, inode);
if (err)
goto out;
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
d_instantiate(dentry, inode);
@@ -441,7 +443,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
out:
clear_inode_flag(inode, FI_INC_LINK);
iput(inode);
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
return err;
}
@@ -545,6 +547,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode = d_inode(dentry);
struct f2fs_dir_entry *de;
+ struct f2fs_lock_context lc;
struct folio *folio;
int err;
@@ -581,15 +584,15 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
f2fs_balance_fs(sbi, true);
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
err = f2fs_acquire_orphan_inode(sbi);
if (err) {
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
f2fs_folio_put(folio, false);
goto out;
}
f2fs_delete_entry(de, folio, dir, inode);
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
@@ -632,6 +635,7 @@ static int f2fs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *symname)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ struct f2fs_lock_context lc;
struct inode *inode;
size_t len = strlen(symname);
struct fscrypt_str disk_link;
@@ -662,11 +666,11 @@ static int f2fs_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode_nohighmem(inode);
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
err = f2fs_add_link(dentry, inode);
if (err)
goto out_f2fs_handle_failed_inode;
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
f2fs_alloc_nid_done(sbi, inode->i_ino);
err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link);
@@ -701,7 +705,7 @@ err_out:
goto out_free_encrypted_link;
out_f2fs_handle_failed_inode:
- f2fs_handle_failed_inode(inode);
+ f2fs_handle_failed_inode(inode, &lc);
out_free_encrypted_link:
if (disk_link.name != (unsigned char *)symname)
kfree(disk_link.name);
@@ -712,6 +716,7 @@ static struct dentry *f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ struct f2fs_lock_context lc;
struct inode *inode;
int err;
@@ -732,11 +737,11 @@ static struct dentry *f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
set_inode_flag(inode, FI_INC_LINK);
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
err = f2fs_add_link(dentry, inode);
if (err)
goto out_fail;
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
f2fs_alloc_nid_done(sbi, inode->i_ino);
@@ -750,7 +755,7 @@ static struct dentry *f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
out_fail:
clear_inode_flag(inode, FI_INC_LINK);
- f2fs_handle_failed_inode(inode);
+ f2fs_handle_failed_inode(inode, &lc);
return ERR_PTR(err);
}
@@ -767,6 +772,7 @@ static int f2fs_mknod(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t rdev)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ struct f2fs_lock_context lc;
struct inode *inode;
int err = 0;
@@ -786,11 +792,11 @@ static int f2fs_mknod(struct mnt_idmap *idmap, struct inode *dir,
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &f2fs_special_inode_operations;
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
err = f2fs_add_link(dentry, inode);
if (err)
goto out;
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
f2fs_alloc_nid_done(sbi, inode->i_ino);
@@ -802,7 +808,7 @@ static int f2fs_mknod(struct mnt_idmap *idmap, struct inode *dir,
f2fs_balance_fs(sbi, true);
return 0;
out:
- f2fs_handle_failed_inode(inode);
+ f2fs_handle_failed_inode(inode, &lc);
return err;
}
@@ -811,6 +817,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
struct inode **new_inode, struct f2fs_filename *fname)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ struct f2fs_lock_context lc;
struct inode *inode;
int err;
@@ -831,7 +838,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
inode->i_mapping->a_ops = &f2fs_dblock_aops;
}
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
err = f2fs_acquire_orphan_inode(sbi);
if (err)
goto out;
@@ -860,7 +867,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
f2fs_i_links_write(inode, false);
}
/* link_count was changed by d_tmpfile as well. */
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
unlock_new_inode(inode);
if (new_inode)
@@ -872,7 +879,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
release_out:
f2fs_release_orphan_inode(sbi);
out:
- f2fs_handle_failed_inode(inode);
+ f2fs_handle_failed_inode(inode, &lc);
return err;
}
@@ -920,6 +927,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct f2fs_dir_entry *old_dir_entry = NULL;
struct f2fs_dir_entry *old_entry;
struct f2fs_dir_entry *new_entry;
+ struct f2fs_lock_context lc;
bool old_is_dir = S_ISDIR(old_inode->i_mode);
int err;
@@ -1008,7 +1016,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
f2fs_balance_fs(sbi, true);
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
err = f2fs_acquire_orphan_inode(sbi);
if (err)
@@ -1031,11 +1039,11 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
} else {
f2fs_balance_fs(sbi, true);
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
err = f2fs_add_link(new_dentry, old_inode);
if (err) {
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
goto out_dir;
}
@@ -1084,7 +1092,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
TRANS_DIR_INO);
}
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
f2fs_sync_fs(sbi->sb, 1);
@@ -1093,7 +1101,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
return 0;
put_out_dir:
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
f2fs_folio_put(new_folio, false);
out_dir:
if (old_dir_entry)
@@ -1115,6 +1123,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
struct folio *old_folio, *new_folio;
struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL;
struct f2fs_dir_entry *old_entry, *new_entry;
+ struct f2fs_lock_context lc;
int old_nlink = 0, new_nlink = 0;
int err;
@@ -1194,7 +1203,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_balance_fs(sbi, true);
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
/* update ".." directory entry info of old dentry */
if (old_dir_entry)
@@ -1247,7 +1256,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO);
}
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
f2fs_sync_fs(sbi->sb, 1);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 482a362f2625..74992fd9c9b6 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -606,7 +606,7 @@ retry:
goto retry;
}
- i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
+ i = f2fs_lookup_journal_in_cursum(sbi, journal, NAT_JOURNAL, nid, 0);
if (i >= 0) {
ne = nat_in_journal(journal, i);
node_info_from_raw_nat(ni, &ne);
@@ -643,6 +643,17 @@ sanity_check:
return -EFSCORRUPTED;
}
+ if (unlikely(f2fs_quota_file(sbi, ni->nid) &&
+ !__is_valid_data_blkaddr(ni->blk_addr))) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_err_ratelimited(sbi,
+ "f2fs_get_node_info of %pS: inconsistent nat entry from qf_ino, "
+ "ino:%u, nid:%u, blkaddr:%u, ver:%u, flag:%u",
+ __builtin_return_address(0),
+ ni->ino, ni->nid, ni->blk_addr, ni->version, ni->flag);
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT);
+ }
+
/* cache nat entry */
if (need_cache)
cache_nat_entry(sbi, nid, &ne);
@@ -1500,24 +1511,33 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
f2fs_folio_put(afolio, err ? true : false);
}
-static int sanity_check_node_footer(struct f2fs_sb_info *sbi,
+int f2fs_sanity_check_node_footer(struct f2fs_sb_info *sbi,
struct folio *folio, pgoff_t nid,
- enum node_type ntype)
+ enum node_type ntype, bool in_irq)
{
+ bool is_inode, is_xnode;
+
if (unlikely(nid != nid_of_node(folio)))
goto out_err;
+ is_inode = IS_INODE(folio);
+ is_xnode = f2fs_has_xattr_block(ofs_of_node(folio));
+
switch (ntype) {
+ case NODE_TYPE_REGULAR:
+ if (is_inode && is_xnode)
+ goto out_err;
+ break;
case NODE_TYPE_INODE:
- if (!IS_INODE(folio))
+ if (!is_inode || is_xnode)
goto out_err;
break;
case NODE_TYPE_XATTR:
- if (!f2fs_has_xattr_block(ofs_of_node(folio)))
+ if (is_inode || !is_xnode)
goto out_err;
break;
case NODE_TYPE_NON_INODE:
- if (IS_INODE(folio))
+ if (is_inode)
goto out_err;
break;
default:
@@ -1527,12 +1547,13 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi,
goto out_err;
return 0;
out_err:
- f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, "
- "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
- ntype, nid, nid_of_node(folio), ino_of_node(folio),
- ofs_of_node(folio), cpver_of_node(folio),
- next_blkaddr_of_node(folio));
set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_warn_ratelimited(sbi, "inconsistent node block, node_type:%d, nid:%lu, "
+ "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
+ ntype, nid, nid_of_node(folio), ino_of_node(folio),
+ ofs_of_node(folio), cpver_of_node(folio),
+ next_blkaddr_of_node(folio));
+
f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER);
return -EFSCORRUPTED;
}
@@ -1578,7 +1599,7 @@ repeat:
goto out_err;
}
page_hit:
- err = sanity_check_node_footer(sbi, folio, nid, ntype);
+ err = f2fs_sanity_check_node_footer(sbi, folio, nid, ntype, false);
if (!err)
return folio;
out_err:
@@ -1727,6 +1748,7 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted
.io_type = io_type,
.io_wbc = wbc,
};
+ struct f2fs_lock_context lc;
unsigned int seq;
trace_f2fs_writepage(folio, NODE);
@@ -1751,18 +1773,23 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted
/* get old block addr of this node page */
nid = nid_of_node(folio);
- f2fs_bug_on(sbi, folio->index != nid);
+
+ if (f2fs_sanity_check_node_footer(sbi, folio, nid,
+ NODE_TYPE_REGULAR, false)) {
+ f2fs_handle_critical_error(sbi, STOP_CP_REASON_CORRUPTED_NID);
+ goto redirty_out;
+ }
if (f2fs_get_node_info(sbi, nid, &ni, !do_balance))
goto redirty_out;
- f2fs_down_read(&sbi->node_write);
+ f2fs_down_read_trace(&sbi->node_write, &lc);
/* This page is already truncated */
if (unlikely(ni.blk_addr == NULL_ADDR)) {
folio_clear_uptodate(folio);
dec_page_count(sbi, F2FS_DIRTY_NODES);
- f2fs_up_read(&sbi->node_write);
+ f2fs_up_read_trace(&sbi->node_write, &lc);
folio_unlock(folio);
return true;
}
@@ -1770,12 +1797,17 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted
if (__is_valid_data_blkaddr(ni.blk_addr) &&
!f2fs_is_valid_blkaddr(sbi, ni.blk_addr,
DATA_GENERIC_ENHANCE)) {
- f2fs_up_read(&sbi->node_write);
+ f2fs_up_read_trace(&sbi->node_write, &lc);
goto redirty_out;
}
- if (atomic && !test_opt(sbi, NOBARRIER))
- fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
+ if (atomic) {
+ if (!test_opt(sbi, NOBARRIER))
+ fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
+ if (IS_INODE(folio))
+ set_dentry_mark(folio,
+ f2fs_need_dentry_mark(sbi, ino_of_node(folio)));
+ }
/* should add to global list before clearing PAGECACHE status */
if (f2fs_in_warm_node_list(sbi, folio)) {
@@ -1790,7 +1822,7 @@ static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted
f2fs_do_write_node_page(nid, &fio);
set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(folio));
dec_page_count(sbi, F2FS_DIRTY_NODES);
- f2fs_up_read(&sbi->node_write);
+ f2fs_up_read_trace(&sbi->node_write, &lc);
folio_unlock(folio);
@@ -1916,8 +1948,9 @@ continue_unlock:
if (is_inode_flag_set(inode,
FI_DIRTY_INODE))
f2fs_update_inode(inode, folio);
- set_dentry_mark(folio,
- f2fs_need_dentry_mark(sbi, ino));
+ if (!atomic)
+ set_dentry_mark(folio,
+ f2fs_need_dentry_mark(sbi, ino));
}
/* may be written by other thread */
if (!folio_test_dirty(folio))
@@ -2937,7 +2970,7 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
/* scan the node segment */
last_offset = BLKS_PER_SEG(sbi);
addr = START_BLOCK(sbi, segno);
- sum_entry = &sum->entries[0];
+ sum_entry = sum_entries(sum);
for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
nrpages = bio_max_segs(last_offset - i);
@@ -3078,7 +3111,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
* #2, flush nat entries to nat page.
*/
if (enabled_nat_bits(sbi, cpc) ||
- !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
+ !__has_cursum_space(sbi, journal, set->entry_cnt, NAT_JOURNAL))
to_journal = false;
if (to_journal) {
@@ -3101,7 +3134,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR);
if (to_journal) {
- offset = f2fs_lookup_journal_in_cursum(journal,
+ offset = f2fs_lookup_journal_in_cursum(sbi, journal,
NAT_JOURNAL, nid, 1);
f2fs_bug_on(sbi, offset < 0);
raw_ne = &nat_in_journal(journal, offset);
@@ -3146,7 +3179,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct f2fs_journal *journal = curseg->journal;
struct nat_entry_set *setvec[NAT_VEC_SIZE];
struct nat_entry_set *set, *tmp;
- unsigned int found;
+ unsigned int found, entry_count = 0;
nid_t set_idx = 0;
LIST_HEAD(sets);
int err = 0;
@@ -3172,7 +3205,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* into nat entry set.
*/
if (enabled_nat_bits(sbi, cpc) ||
- !__has_cursum_space(journal,
+ !__has_cursum_space(sbi, journal,
nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
remove_nats_in_journal(sbi);
@@ -3183,9 +3216,21 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
set_idx = setvec[found - 1]->set + 1;
for (idx = 0; idx < found; idx++)
__adjust_nat_entry_set(setvec[idx], &sets,
- MAX_NAT_JENTRIES(journal));
+ MAX_NAT_JENTRIES(sbi, journal));
}
+ /*
+ * Readahead the current NAT block to prevent read requests from
+ * being issued and waited on one by one.
+ */
+ list_for_each_entry(set, &sets, set_list) {
+ entry_count += set->entry_cnt;
+ if (!enabled_nat_bits(sbi, cpc) &&
+ __has_cursum_space(sbi, journal,
+ entry_count, NAT_JOURNAL))
+ continue;
+ f2fs_ra_meta_pages(sbi, set->set, 1, META_NAT, true);
+ }
/* flush dirty nats in nat entry set */
list_for_each_entry_safe(set, tmp, &sets, set_list) {
err = __flush_nat_entry_set(sbi, set, cpc);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 9cb8dcf8d417..824ac9f0e6e4 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -52,14 +52,6 @@ enum {
IS_PREALLOC, /* nat entry is preallocated */
};
-/* For node type in __get_node_folio() */
-enum node_type {
- NODE_TYPE_REGULAR,
- NODE_TYPE_INODE,
- NODE_TYPE_XATTR,
- NODE_TYPE_NON_INODE,
-};
-
/*
* For node information
*/
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index c3415ebb9f50..a26071f2b0bc 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -514,7 +514,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
struct curseg_info *curseg = CURSEG_I(sbi, i);
if (curseg->segno == segno) {
- sum = curseg->sum_blk->entries[blkoff];
+ sum = sum_entries(curseg->sum_blk)[blkoff];
goto got_it;
}
}
@@ -522,8 +522,8 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
sum_folio = f2fs_get_sum_folio(sbi, segno);
if (IS_ERR(sum_folio))
return PTR_ERR(sum_folio);
- sum_node = SUM_BLK_PAGE_ADDR(sum_folio, segno);
- sum = sum_node->entries[blkoff];
+ sum_node = SUM_BLK_PAGE_ADDR(sbi, sum_folio, segno);
+ sum = sum_entries(sum_node)[blkoff];
f2fs_folio_put(sum_folio, true);
got_it:
/* Use the locked dnode page and inode */
@@ -875,6 +875,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
LIST_HEAD(inode_list);
LIST_HEAD(tmp_inode_list);
LIST_HEAD(dir_list);
+ struct f2fs_lock_context lc;
int err;
int ret = 0;
unsigned long s_flags = sbi->sb->s_flags;
@@ -888,7 +889,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
f2fs_info(sbi, "recover fsync data on readonly fs");
/* prevent checkpoint */
- f2fs_down_write(&sbi->cp_global_sem);
+ f2fs_down_write_trace(&sbi->cp_global_sem, &lc);
/* step #1: find fsynced inode numbers */
err = find_fsync_dnodes(sbi, &inode_list, check_only, &new_inode);
@@ -932,7 +933,7 @@ skip:
if (!err)
clear_sbi_flag(sbi, SBI_POR_DOING);
- f2fs_up_write(&sbi->cp_global_sem);
+ f2fs_up_write_trace(&sbi->cp_global_sem, &lc);
/* let's drop all the directory inodes for clean checkpoint */
destroy_fsync_dnodes(&dir_list, err);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index c26424f47686..6a97fe76712b 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -371,8 +371,8 @@ next:
}
out:
- if (time_to_inject(sbi, FAULT_TIMEOUT))
- f2fs_io_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT);
+ if (time_to_inject(sbi, FAULT_ATOMIC_TIMEOUT))
+ f2fs_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT, true);
if (ret) {
sbi->revoked_atomic_block += fi->atomic_write_cnt;
@@ -400,6 +400,7 @@ int f2fs_commit_atomic_write(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_lock_context lc;
int err;
err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
@@ -407,11 +408,11 @@ int f2fs_commit_atomic_write(struct inode *inode)
return err;
f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
err = __f2fs_commit_atomic_write(inode);
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
return err;
@@ -461,7 +462,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
.should_migrate_blocks = false,
.err_gc_skipped = false,
.nr_free_secs = 1 };
- f2fs_down_write(&sbi->gc_lock);
+ f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc);
stat_inc_gc_call_count(sbi, FOREGROUND);
f2fs_gc(sbi, &gc_control);
}
@@ -1286,7 +1287,6 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
&(dcc->fstrim_list) : &(dcc->wait_list);
blk_opf_t flag = dpolicy->sync ? REQ_SYNC : 0;
block_t lstart, start, len, total_len;
- int err = 0;
if (dc->state != D_PREP)
return 0;
@@ -1327,7 +1327,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
dc->di.len = 0;
- while (total_len && *issued < dpolicy->max_requests && !err) {
+ while (total_len && *issued < dpolicy->max_requests) {
struct bio *bio = NULL;
unsigned long flags;
bool last = true;
@@ -1343,17 +1343,6 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
dc->di.len += len;
- err = 0;
- if (time_to_inject(sbi, FAULT_DISCARD)) {
- err = -EIO;
- spin_lock_irqsave(&dc->lock, flags);
- if (dc->state == D_PARTIAL)
- dc->state = D_SUBMIT;
- spin_unlock_irqrestore(&dc->lock, flags);
-
- break;
- }
-
__blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start),
SECTOR_FROM_BLOCK(len), GFP_NOFS, &bio);
f2fs_bug_on(sbi, !bio);
@@ -1392,11 +1381,11 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
len = total_len;
}
- if (!err && len) {
+ if (len) {
dcc->undiscard_blks -= len;
__update_discard_tree_range(sbi, bdev, lstart, start, len);
}
- return err;
+ return 0;
}
static void __insert_discard_cmd(struct f2fs_sb_info *sbi,
@@ -2685,12 +2674,12 @@ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
valid_sum_count += f2fs_curseg_valid_blocks(sbi, i);
}
- sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE -
+ sum_in_page = (sbi->blocksize - 2 * sbi->sum_journal_size -
SUM_FOOTER_SIZE) / SUMMARY_SIZE;
if (valid_sum_count <= sum_in_page)
return 1;
else if ((valid_sum_count - sum_in_page) <=
- (PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE)
+ (sbi->blocksize - SUM_FOOTER_SIZE) / SUMMARY_SIZE)
return 2;
return 3;
}
@@ -2710,7 +2699,7 @@ void f2fs_update_meta_page(struct f2fs_sb_info *sbi,
{
struct folio *folio;
- if (SUMS_PER_BLOCK == 1)
+ if (!f2fs_sb_has_packed_ssa(sbi))
folio = f2fs_grab_meta_folio(sbi, blk_addr);
else
folio = f2fs_get_meta_folio_retry(sbi, blk_addr);
@@ -2728,7 +2717,7 @@ static void write_sum_page(struct f2fs_sb_info *sbi,
{
struct folio *folio;
- if (SUMS_PER_BLOCK == 1)
+ if (!f2fs_sb_has_packed_ssa(sbi))
return f2fs_update_meta_page(sbi, (void *)sum_blk,
GET_SUM_BLOCK(sbi, segno));
@@ -2736,7 +2725,8 @@ static void write_sum_page(struct f2fs_sb_info *sbi,
if (IS_ERR(folio))
return;
- memcpy(SUM_BLK_PAGE_ADDR(folio, segno), sum_blk, sizeof(*sum_blk));
+ memcpy(SUM_BLK_PAGE_ADDR(sbi, folio, segno), sum_blk,
+ sbi->sum_blocksize);
folio_mark_dirty(folio);
f2fs_folio_put(folio, true);
}
@@ -2755,11 +2745,11 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi,
mutex_lock(&curseg->curseg_mutex);
down_read(&curseg->journal_rwsem);
- memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE);
+ memcpy(sum_journal(sbi, dst), curseg->journal, sbi->sum_journal_size);
up_read(&curseg->journal_rwsem);
- memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE);
- memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE);
+ memcpy(sum_entries(dst), sum_entries(src), sbi->sum_entry_size);
+ memcpy(sum_footer(sbi, dst), sum_footer(sbi, src), SUM_FOOTER_SIZE);
mutex_unlock(&curseg->curseg_mutex);
@@ -2932,7 +2922,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
curseg->next_blkoff = 0;
curseg->next_segno = NULL_SEGNO;
- sum_footer = &(curseg->sum_blk->footer);
+ sum_footer = sum_footer(sbi, curseg->sum_blk);
memset(sum_footer, 0, sizeof(struct summary_footer));
sanity_check_seg_type(sbi, seg_type);
@@ -3078,11 +3068,11 @@ static int change_curseg(struct f2fs_sb_info *sbi, int type)
sum_folio = f2fs_get_sum_folio(sbi, new_segno);
if (IS_ERR(sum_folio)) {
/* GC won't be able to use stale summary pages by cp_error */
- memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE);
+ memset(curseg->sum_blk, 0, sbi->sum_entry_size);
return PTR_ERR(sum_folio);
}
- sum_node = SUM_BLK_PAGE_ADDR(sum_folio, new_segno);
- memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
+ sum_node = SUM_BLK_PAGE_ADDR(sbi, sum_folio, new_segno);
+ memcpy(curseg->sum_blk, sum_node, sbi->sum_entry_size);
f2fs_folio_put(sum_folio, true);
return 0;
}
@@ -3362,19 +3352,20 @@ int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force)
int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi)
{
+ struct f2fs_lock_context lc;
int err;
bool gc_required = true;
retry:
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
err = f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
if (f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) {
- f2fs_down_write(&sbi->gc_lock);
+ f2fs_down_write_trace(&sbi->gc_lock, &lc);
err = f2fs_gc_range(sbi, 0, sbi->first_seq_zone_segno - 1,
true, ZONED_PIN_SEC_REQUIRED_COUNT);
- f2fs_up_write(&sbi->gc_lock);
+ f2fs_up_write_trace(&sbi->gc_lock, &lc);
gc_required = false;
if (!err)
@@ -3494,6 +3485,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
block_t start_block, end_block;
struct cp_control cpc;
struct discard_policy dpolicy;
+ struct f2fs_lock_context lc;
unsigned long long trimmed = 0;
int err = 0;
bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi);
@@ -3526,10 +3518,10 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
if (sbi->discard_blks == 0)
goto out;
- f2fs_down_write(&sbi->gc_lock);
+ f2fs_down_write_trace(&sbi->gc_lock, &lc);
stat_inc_cp_call_count(sbi, TOTAL_CALL);
err = f2fs_write_checkpoint(sbi, &cpc);
- f2fs_up_write(&sbi->gc_lock);
+ f2fs_up_write_trace(&sbi->gc_lock, &lc);
if (err)
goto out;
@@ -3814,7 +3806,7 @@ int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio,
f2fs_wait_discard_bio(sbi, *new_blkaddr);
- curseg->sum_blk->entries[curseg->next_blkoff] = *sum;
+ sum_entries(curseg->sum_blk)[curseg->next_blkoff] = *sum;
if (curseg->alloc_type == SSR) {
curseg->next_blkoff = f2fs_find_next_ssr_block(sbi, curseg);
} else {
@@ -4183,7 +4175,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
}
curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
- curseg->sum_blk->entries[curseg->next_blkoff] = *sum;
+ sum_entries(curseg->sum_blk)[curseg->next_blkoff] = *sum;
if (!recover_curseg || recover_newaddr) {
if (!from_gc)
@@ -4240,7 +4232,7 @@ void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type,
struct f2fs_sb_info *sbi = F2FS_F_SB(folio);
/* submit cached LFS IO */
- f2fs_submit_merged_write_cond(sbi, NULL, folio, 0, type);
+ f2fs_submit_merged_write_folio(sbi, folio, type);
/* submit cached IPU IO */
f2fs_submit_merged_ipu_write(sbi, NULL, folio);
if (ordered) {
@@ -4303,12 +4295,12 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
/* Step 1: restore nat cache */
seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
- memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE);
+ memcpy(seg_i->journal, kaddr, sbi->sum_journal_size);
/* Step 2: restore sit cache */
seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
- memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE);
- offset = 2 * SUM_JOURNAL_SIZE;
+ memcpy(seg_i->journal, kaddr + sbi->sum_journal_size, sbi->sum_journal_size);
+ offset = 2 * sbi->sum_journal_size;
/* Step 3: restore summary entries */
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
@@ -4330,9 +4322,9 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
struct f2fs_summary *s;
s = (struct f2fs_summary *)(kaddr + offset);
- seg_i->sum_blk->entries[j] = *s;
+ sum_entries(seg_i->sum_blk)[j] = *s;
offset += SUMMARY_SIZE;
- if (offset + SUMMARY_SIZE <= PAGE_SIZE -
+ if (offset + SUMMARY_SIZE <= sbi->blocksize -
SUM_FOOTER_SIZE)
continue;
@@ -4388,7 +4380,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
if (IS_NODESEG(type)) {
if (__exist_node_summaries(sbi)) {
- struct f2fs_summary *ns = &sum->entries[0];
+ struct f2fs_summary *ns = sum_entries(sum);
int i;
for (i = 0; i < BLKS_PER_SEG(sbi); i++, ns++) {
@@ -4408,11 +4400,13 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
/* update journal info */
down_write(&curseg->journal_rwsem);
- memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE);
+ memcpy(curseg->journal, sum_journal(sbi, sum), sbi->sum_journal_size);
up_write(&curseg->journal_rwsem);
- memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE);
- memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE);
+ memcpy(sum_entries(curseg->sum_blk), sum_entries(sum),
+ sbi->sum_entry_size);
+ memcpy(sum_footer(sbi, curseg->sum_blk), sum_footer(sbi, sum),
+ SUM_FOOTER_SIZE);
curseg->next_segno = segno;
reset_curseg(sbi, type, 0);
curseg->alloc_type = ckpt->alloc_type[type];
@@ -4456,8 +4450,8 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
}
/* sanity check for summary blocks */
- if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES ||
- sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) {
+ if (nats_in_cursum(nat_j) > sbi->nat_journal_entries ||
+ sits_in_cursum(sit_j) > sbi->sit_journal_entries) {
f2fs_err(sbi, "invalid journal entries nats %u sits %u",
nats_in_cursum(nat_j), sits_in_cursum(sit_j));
return -EINVAL;
@@ -4481,13 +4475,13 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
/* Step 1: write nat cache */
seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
- memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE);
- written_size += SUM_JOURNAL_SIZE;
+ memcpy(kaddr, seg_i->journal, sbi->sum_journal_size);
+ written_size += sbi->sum_journal_size;
/* Step 2: write sit cache */
seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
- memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE);
- written_size += SUM_JOURNAL_SIZE;
+ memcpy(kaddr + written_size, seg_i->journal, sbi->sum_journal_size);
+ written_size += sbi->sum_journal_size;
/* Step 3: write summary entries */
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
@@ -4500,10 +4494,10 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
written_size = 0;
}
summary = (struct f2fs_summary *)(kaddr + written_size);
- *summary = seg_i->sum_blk->entries[j];
+ *summary = sum_entries(seg_i->sum_blk)[j];
written_size += SUMMARY_SIZE;
- if (written_size + SUMMARY_SIZE <= PAGE_SIZE -
+ if (written_size + SUMMARY_SIZE <= sbi->blocksize -
SUM_FOOTER_SIZE)
continue;
@@ -4545,8 +4539,9 @@ void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
}
-int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
- unsigned int val, int alloc)
+int f2fs_lookup_journal_in_cursum(struct f2fs_sb_info *sbi,
+ struct f2fs_journal *journal, int type,
+ unsigned int val, int alloc)
{
int i;
@@ -4555,13 +4550,13 @@ int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
if (le32_to_cpu(nid_in_journal(journal, i)) == val)
return i;
}
- if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL))
+ if (alloc && __has_cursum_space(sbi, journal, 1, NAT_JOURNAL))
return update_nats_in_cursum(journal, 1);
} else if (type == SIT_JOURNAL) {
for (i = 0; i < sits_in_cursum(journal); i++)
if (le32_to_cpu(segno_in_journal(journal, i)) == val)
return i;
- if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL))
+ if (alloc && __has_cursum_space(sbi, journal, 1, SIT_JOURNAL))
return update_sits_in_cursum(journal, 1);
}
return -1;
@@ -4709,8 +4704,8 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* entries, remove all entries from journal and add and account
* them in sit entry set.
*/
- if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL) ||
- !to_journal)
+ if (!__has_cursum_space(sbi, journal,
+ sit_i->dirty_sentries, SIT_JOURNAL) || !to_journal)
remove_sits_in_journal(sbi);
/*
@@ -4727,7 +4722,8 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
unsigned int segno = start_segno;
if (to_journal &&
- !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL))
+ !__has_cursum_space(sbi, journal, ses->entry_cnt,
+ SIT_JOURNAL))
to_journal = false;
if (to_journal) {
@@ -4755,7 +4751,7 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
}
if (to_journal) {
- offset = f2fs_lookup_journal_in_cursum(journal,
+ offset = f2fs_lookup_journal_in_cursum(sbi, journal,
SIT_JOURNAL, segno, 1);
f2fs_bug_on(sbi, offset < 0);
segno_in_journal(journal, offset) =
@@ -4962,12 +4958,13 @@ static int build_curseg(struct f2fs_sb_info *sbi)
for (i = 0; i < NO_CHECK_TYPE; i++) {
mutex_init(&array[i].curseg_mutex);
- array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL);
+ array[i].sum_blk = f2fs_kzalloc(sbi, sbi->sum_blocksize,
+ GFP_KERNEL);
if (!array[i].sum_blk)
return -ENOMEM;
init_rwsem(&array[i].journal_rwsem);
array[i].journal = f2fs_kzalloc(sbi,
- sizeof(struct f2fs_journal), GFP_KERNEL);
+ sbi->sum_journal_size, GFP_KERNEL);
if (!array[i].journal)
return -ENOMEM;
array[i].seg_type = log_type_to_seg_type(i);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 07dcbcbeb7c6..068845660b0f 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -90,12 +90,11 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
#define GET_ZONE_FROM_SEG(sbi, segno) \
GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno))
-#define SUMS_PER_BLOCK (F2FS_BLKSIZE / F2FS_SUM_BLKSIZE)
#define GET_SUM_BLOCK(sbi, segno) \
- (SM_I(sbi)->ssa_blkaddr + (segno / SUMS_PER_BLOCK))
-#define GET_SUM_BLKOFF(segno) (segno % SUMS_PER_BLOCK)
-#define SUM_BLK_PAGE_ADDR(folio, segno) \
- (folio_address(folio) + GET_SUM_BLKOFF(segno) * F2FS_SUM_BLKSIZE)
+ (SM_I(sbi)->ssa_blkaddr + (segno / (sbi)->sums_per_block))
+#define GET_SUM_BLKOFF(sbi, segno) (segno % (sbi)->sums_per_block)
+#define SUM_BLK_PAGE_ADDR(sbi, folio, segno) \
+ (folio_address(folio) + GET_SUM_BLKOFF(sbi, segno) * (sbi)->sum_blocksize)
#define GET_SUM_TYPE(footer) ((footer)->entry_type)
#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type))
@@ -621,97 +620,90 @@ static inline unsigned int get_left_section_blocks(struct f2fs_sb_info *sbi,
return CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true);
}
-static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
- unsigned int node_blocks, unsigned int data_blocks,
- unsigned int dent_blocks)
+static inline void get_additional_blocks_required(struct f2fs_sb_info *sbi,
+ unsigned int *total_node_blocks, unsigned int *total_data_blocks,
+ unsigned int *total_dent_blocks, bool separate_dent)
{
- unsigned int segno, left_blocks, blocks;
+ unsigned int segno, left_blocks;
int i;
+ unsigned int min_free_node_blocks = CAP_BLKS_PER_SEC(sbi);
+ unsigned int min_free_dent_blocks = CAP_BLKS_PER_SEC(sbi);
+ unsigned int min_free_data_blocks = CAP_BLKS_PER_SEC(sbi);
/* check current data/node sections in the worst case. */
for (i = CURSEG_HOT_DATA; i < NR_PERSISTENT_LOG; i++) {
segno = CURSEG_I(sbi, i)->segno;
if (unlikely(segno == NULL_SEGNO))
- return false;
+ return;
left_blocks = get_left_section_blocks(sbi, i, segno);
- blocks = i <= CURSEG_COLD_DATA ? data_blocks : node_blocks;
- if (blocks > left_blocks)
- return false;
+ if (i > CURSEG_COLD_DATA)
+ min_free_node_blocks = min(min_free_node_blocks, left_blocks);
+ else if (i == CURSEG_HOT_DATA && separate_dent)
+ min_free_dent_blocks = left_blocks;
+ else
+ min_free_data_blocks = min(min_free_data_blocks, left_blocks);
}
- /* check current data section for dentry blocks. */
- segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno;
-
- if (unlikely(segno == NULL_SEGNO))
- return false;
-
- left_blocks = get_left_section_blocks(sbi, CURSEG_HOT_DATA, segno);
-
- if (dent_blocks > left_blocks)
- return false;
- return true;
+ *total_node_blocks = (*total_node_blocks > min_free_node_blocks) ?
+ *total_node_blocks - min_free_node_blocks : 0;
+ *total_dent_blocks = (*total_dent_blocks > min_free_dent_blocks) ?
+ *total_dent_blocks - min_free_dent_blocks : 0;
+ *total_data_blocks = (*total_data_blocks > min_free_data_blocks) ?
+ *total_data_blocks - min_free_data_blocks : 0;
}
/*
- * calculate needed sections for dirty node/dentry and call
- * has_curseg_enough_space, please note that, it needs to account
- * dirty data as well in lfs mode when checkpoint is disabled.
+ * call get_additional_blocks_required to calculate dirty blocks
+ * needing to be placed in free sections, please note that, it
+ * needs to account dirty data as well in lfs mode when checkpoint
+ * is disabled.
*/
-static inline void __get_secs_required(struct f2fs_sb_info *sbi,
- unsigned int *lower_p, unsigned int *upper_p, bool *curseg_p)
+static inline int __get_secs_required(struct f2fs_sb_info *sbi)
{
unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) +
get_pages(sbi, F2FS_DIRTY_DENTS) +
get_pages(sbi, F2FS_DIRTY_IMETA);
unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS);
unsigned int total_data_blocks = 0;
- unsigned int node_secs = total_node_blocks / CAP_BLKS_PER_SEC(sbi);
- unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi);
- unsigned int data_secs = 0;
- unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi);
- unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi);
- unsigned int data_blocks = 0;
+ bool separate_dent = true;
- if (f2fs_lfs_mode(sbi)) {
+ if (f2fs_lfs_mode(sbi))
total_data_blocks = get_pages(sbi, F2FS_DIRTY_DATA);
- data_secs = total_data_blocks / CAP_BLKS_PER_SEC(sbi);
- data_blocks = total_data_blocks % CAP_BLKS_PER_SEC(sbi);
+
+ /*
+ * When active_logs != 4, dentry blocks and data blocks can be
+ * mixed in the same logs, so check their space together.
+ */
+ if (F2FS_OPTION(sbi).active_logs != 4) {
+ total_data_blocks += total_dent_blocks;
+ total_dent_blocks = 0;
+ separate_dent = false;
}
- if (lower_p)
- *lower_p = node_secs + dent_secs + data_secs;
- if (upper_p)
- *upper_p = node_secs + dent_secs + data_secs +
- (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0) +
- (data_blocks ? 1 : 0);
- if (curseg_p)
- *curseg_p = has_curseg_enough_space(sbi,
- node_blocks, data_blocks, dent_blocks);
+ get_additional_blocks_required(sbi, &total_node_blocks, &total_dent_blocks,
+ &total_data_blocks, separate_dent);
+
+ return DIV_ROUND_UP(total_node_blocks, CAP_BLKS_PER_SEC(sbi)) +
+ DIV_ROUND_UP(total_dent_blocks, CAP_BLKS_PER_SEC(sbi)) +
+ DIV_ROUND_UP(total_data_blocks, CAP_BLKS_PER_SEC(sbi));
}
static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
int freed, int needed)
{
- unsigned int free_secs, lower_secs, upper_secs;
- bool curseg_space;
+ unsigned int free_secs, required_secs;
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
return false;
- __get_secs_required(sbi, &lower_secs, &upper_secs, &curseg_space);
-
free_secs = free_sections(sbi) + freed;
- lower_secs += needed + reserved_sections(sbi);
- upper_secs += needed + reserved_sections(sbi);
+ required_secs = needed + reserved_sections(sbi) +
+ __get_secs_required(sbi);
- if (free_secs > upper_secs)
- return false;
- if (free_secs <= lower_secs)
- return true;
- return !curseg_space;
+ return free_secs < required_secs;
}
static inline bool has_enough_free_secs(struct f2fs_sb_info *sbi,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index cd00d030edda..7c8e6eea60df 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -67,8 +67,10 @@ const char *f2fs_fault_name[FAULT_MAX] = {
[FAULT_BLKADDR_CONSISTENCE] = "inconsistent blkaddr",
[FAULT_NO_SEGMENT] = "no free segment",
[FAULT_INCONSISTENT_FOOTER] = "inconsistent footer",
- [FAULT_TIMEOUT] = "timeout",
+ [FAULT_ATOMIC_TIMEOUT] = "atomic timeout",
[FAULT_VMALLOC] = "vmalloc",
+ [FAULT_LOCK_TIMEOUT] = "lock timeout",
+ [FAULT_SKIP_WRITE] = "skip write",
};
int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate,
@@ -96,8 +98,57 @@ int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate,
f2fs_info(sbi, "build fault injection type: 0x%lx", type);
}
+ if (fo & FAULT_TIMEOUT) {
+ if (type >= TIMEOUT_TYPE_MAX)
+ return -EINVAL;
+ ffi->inject_lock_timeout = (unsigned int)type;
+ f2fs_info(sbi, "build fault timeout injection type: 0x%lx", type);
+ }
+
return 0;
}
+
+static void inject_timeout(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info;
+ enum f2fs_timeout_type type = ffi->inject_lock_timeout;
+ unsigned long start_time = jiffies;
+ unsigned long timeout = HZ;
+
+ switch (type) {
+ case TIMEOUT_TYPE_RUNNING:
+ while (!time_after(jiffies, start_time + timeout)) {
+ if (fatal_signal_pending(current))
+ return;
+ ;
+ }
+ break;
+ case TIMEOUT_TYPE_IO_SLEEP:
+ f2fs_schedule_timeout_killable(timeout, true);
+ break;
+ case TIMEOUT_TYPE_NONIO_SLEEP:
+ f2fs_schedule_timeout_killable(timeout, false);
+ break;
+ case TIMEOUT_TYPE_RUNNABLE:
+ while (!time_after(jiffies, start_time + timeout)) {
+ if (fatal_signal_pending(current))
+ return;
+ schedule();
+ }
+ break;
+ default:
+ return;
+ }
+}
+
+void f2fs_simulate_lock_timeout(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_lock_context lc;
+
+ f2fs_lock_op(sbi, &lc);
+ inject_timeout(sbi);
+ f2fs_unlock_op(sbi, &lc);
+}
#endif
/* f2fs-wide shrinker description */
@@ -2556,6 +2607,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
{
unsigned int s_flags = sbi->sb->s_flags;
struct cp_control cpc;
+ struct f2fs_lock_context lc;
unsigned int gc_mode = sbi->gc_mode;
int err = 0;
int ret;
@@ -2585,7 +2637,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
.no_bg_gc = true,
.nr_free_secs = 1 };
- f2fs_down_write(&sbi->gc_lock);
+ f2fs_down_write_trace(&sbi->gc_lock, &gc_control.lc);
stat_inc_gc_call_count(sbi, FOREGROUND);
err = f2fs_gc(sbi, &gc_control);
if (err == -ENODATA) {
@@ -2609,7 +2661,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
}
skip_gc:
- f2fs_down_write(&sbi->gc_lock);
+ f2fs_down_write_trace(&sbi->gc_lock, &lc);
cpc.reason = CP_PAUSE;
set_sbi_flag(sbi, SBI_CP_DISABLED);
stat_inc_cp_call_count(sbi, TOTAL_CALL);
@@ -2622,7 +2674,7 @@ skip_gc:
spin_unlock(&sbi->stat_lock);
out_unlock:
- f2fs_up_write(&sbi->gc_lock);
+ f2fs_up_write_trace(&sbi->gc_lock, &lc);
restore_flag:
sbi->gc_mode = gc_mode;
sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */
@@ -2632,57 +2684,66 @@ restore_flag:
static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
{
- unsigned int nr_pages = get_pages(sbi, F2FS_DIRTY_DATA) / 16;
- long long start, writeback, lock, sync_inode, end;
+ int retry = MAX_FLUSH_RETRY_COUNT;
+ long long start, writeback, end;
int ret;
+ struct f2fs_lock_context lc;
+ long long skipped_write, dirty_data;
- f2fs_info(sbi, "%s start, meta: %lld, node: %lld, data: %lld",
- __func__,
+ f2fs_info(sbi, "f2fs_enable_checkpoint() starts, meta: %lld, node: %lld, data: %lld",
get_pages(sbi, F2FS_DIRTY_META),
get_pages(sbi, F2FS_DIRTY_NODES),
get_pages(sbi, F2FS_DIRTY_DATA));
- f2fs_update_time(sbi, ENABLE_TIME);
-
start = ktime_get();
+ set_sbi_flag(sbi, SBI_ENABLE_CHECKPOINT);
+
/* we should flush all the data to keep data consistency */
- while (get_pages(sbi, F2FS_DIRTY_DATA)) {
- writeback_inodes_sb_nr(sbi->sb, nr_pages, WB_REASON_SYNC);
+ do {
+ skipped_write = get_pages(sbi, F2FS_SKIPPED_WRITE);
+ dirty_data = get_pages(sbi, F2FS_DIRTY_DATA);
+
+ sync_inodes_sb(sbi->sb);
f2fs_io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
- if (f2fs_time_over(sbi, ENABLE_TIME))
- break;
- }
- writeback = ktime_get();
+ f2fs_info(sbi, "sync_inode_sb done, dirty_data: %lld, %lld, "
+ "skipped write: %lld, %lld, retry: %d",
+ get_pages(sbi, F2FS_DIRTY_DATA),
+ dirty_data,
+ get_pages(sbi, F2FS_SKIPPED_WRITE),
+ skipped_write, retry);
- f2fs_down_write(&sbi->cp_enable_rwsem);
+ /*
+ * sync_inodes_sb() has retry logic, so let's check dirty_data
+ * in prior to skipped_write in case there is no dirty data.
+ */
+ if (!get_pages(sbi, F2FS_DIRTY_DATA))
+ break;
+ if (get_pages(sbi, F2FS_SKIPPED_WRITE) == skipped_write)
+ break;
+ } while (retry--);
- lock = ktime_get();
+ clear_sbi_flag(sbi, SBI_ENABLE_CHECKPOINT);
- if (get_pages(sbi, F2FS_DIRTY_DATA))
- sync_inodes_sb(sbi->sb);
+ writeback = ktime_get();
- if (unlikely(get_pages(sbi, F2FS_DIRTY_DATA)))
- f2fs_warn(sbi, "%s: has some unwritten data: %lld",
- __func__, get_pages(sbi, F2FS_DIRTY_DATA));
+ if (unlikely(get_pages(sbi, F2FS_DIRTY_DATA) ||
+ get_pages(sbi, F2FS_SKIPPED_WRITE)))
+ f2fs_warn(sbi, "checkpoint=enable unwritten data: %lld, skipped data: %lld, retry: %d",
+ get_pages(sbi, F2FS_DIRTY_DATA),
+ get_pages(sbi, F2FS_SKIPPED_WRITE), retry);
- sync_inode = ktime_get();
+ if (get_pages(sbi, F2FS_SKIPPED_WRITE))
+ atomic_set(&sbi->nr_pages[F2FS_SKIPPED_WRITE], 0);
- f2fs_down_write(&sbi->gc_lock);
+ f2fs_down_write_trace(&sbi->gc_lock, &lc);
f2fs_dirty_to_prefree(sbi);
clear_sbi_flag(sbi, SBI_CP_DISABLED);
set_sbi_flag(sbi, SBI_IS_DIRTY);
- f2fs_up_write(&sbi->gc_lock);
+ f2fs_up_write_trace(&sbi->gc_lock, &lc);
- f2fs_info(sbi, "%s sync_fs, meta: %lld, imeta: %lld, node: %lld, dents: %lld, qdata: %lld",
- __func__,
- get_pages(sbi, F2FS_DIRTY_META),
- get_pages(sbi, F2FS_DIRTY_IMETA),
- get_pages(sbi, F2FS_DIRTY_NODES),
- get_pages(sbi, F2FS_DIRTY_DENTS),
- get_pages(sbi, F2FS_DIRTY_QDATA));
ret = f2fs_sync_fs(sbi->sb, 1);
if (ret)
f2fs_err(sbi, "%s sync_fs failed, ret: %d", __func__, ret);
@@ -2690,17 +2751,11 @@ static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
/* Let's ensure there's no pending checkpoint anymore */
f2fs_flush_ckpt_thread(sbi);
- f2fs_up_write(&sbi->cp_enable_rwsem);
-
end = ktime_get();
- f2fs_info(sbi, "%s end, writeback:%llu, "
- "lock:%llu, sync_inode:%llu, sync_fs:%llu",
- __func__,
- ktime_ms_delta(writeback, start),
- ktime_ms_delta(lock, writeback),
- ktime_ms_delta(sync_inode, lock),
- ktime_ms_delta(end, sync_inode));
+ f2fs_info(sbi, "f2fs_enable_checkpoint() finishes, writeback:%llu, sync:%llu",
+ ktime_ms_delta(writeback, start),
+ ktime_ms_delta(end, writeback));
return ret;
}
@@ -3219,19 +3274,12 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly)
}
static int f2fs_quota_enable(struct super_block *sb, int type, int format_id,
- unsigned int flags)
+ unsigned int flags, unsigned long qf_inum)
{
struct inode *qf_inode;
- unsigned long qf_inum;
unsigned long qf_flag = F2FS_QUOTA_DEFAULT_FL;
int err;
- BUG_ON(!f2fs_sb_has_quota_ino(F2FS_SB(sb)));
-
- qf_inum = f2fs_qf_ino(sb, type);
- if (!qf_inum)
- return -EPERM;
-
qf_inode = f2fs_iget(sb, qf_inum);
if (IS_ERR(qf_inode)) {
f2fs_err(F2FS_SB(sb), "Bad quota inode %u:%lu", type, qf_inum);
@@ -3264,7 +3312,7 @@ static int f2fs_enable_quotas(struct super_block *sb)
test_opt(sbi, PRJQUOTA),
};
- if (is_set_ckpt_flags(F2FS_SB(sb), CP_QUOTA_NEED_FSCK_FLAG)) {
+ if (is_set_ckpt_flags(sbi, CP_QUOTA_NEED_FSCK_FLAG)) {
f2fs_err(sbi, "quota file may be corrupted, skip loading it");
return 0;
}
@@ -3276,14 +3324,13 @@ static int f2fs_enable_quotas(struct super_block *sb)
if (qf_inum) {
err = f2fs_quota_enable(sb, type, QFMT_VFS_V1,
DQUOT_USAGE_ENABLED |
- (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
+ (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0), qf_inum);
if (err) {
f2fs_err(sbi, "Failed to enable quota tracking (type=%d, err=%d). Please run fsck to fix.",
type, err);
for (type--; type >= 0; type--)
dquot_quota_off(sb, type);
- set_sbi_flag(F2FS_SB(sb),
- SBI_QUOTA_NEED_REPAIR);
+ set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
return err;
}
}
@@ -3330,6 +3377,7 @@ int f2fs_do_quota_sync(struct super_block *sb, int type)
* that userspace sees the changes.
*/
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ struct f2fs_lock_context lc;
if (type != -1 && cnt != type)
continue;
@@ -3349,13 +3397,13 @@ int f2fs_do_quota_sync(struct super_block *sb, int type)
* block_operation
* f2fs_down_read(quota_sem)
*/
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
f2fs_down_read(&sbi->quota_sem);
ret = f2fs_quota_sync_file(sbi, cnt);
f2fs_up_read(&sbi->quota_sem);
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
if (!f2fs_sb_has_quota_ino(sbi))
inode_unlock(dqopt->files[cnt]);
@@ -4077,20 +4125,6 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
if (sanity_check_area_boundary(sbi, folio, index))
return -EFSCORRUPTED;
- /*
- * Check for legacy summary layout on 16KB+ block devices.
- * Modern f2fs-tools packs multiple 4KB summary areas into one block,
- * whereas legacy versions used one block per summary, leading
- * to a much larger SSA.
- */
- if (SUMS_PER_BLOCK > 1 &&
- !(__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_PACKED_SSA))) {
- f2fs_info(sbi, "Error: Device formatted with a legacy version. "
- "Please reformat with a tool supporting the packed ssa "
- "feature for block sizes larger than 4kb.");
- return -EOPNOTSUPP;
- }
-
return 0;
}
@@ -4300,6 +4334,22 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->max_fragment_hole = DEF_FRAGMENT_SIZE;
spin_lock_init(&sbi->gc_remaining_trials_lock);
atomic64_set(&sbi->current_atomic_write, 0);
+ sbi->max_lock_elapsed_time = MAX_LOCK_ELAPSED_TIME;
+ sbi->adjust_lock_priority = 0;
+ sbi->lock_duration_priority = F2FS_DEFAULT_TASK_PRIORITY;
+ sbi->critical_task_priority = F2FS_CRITICAL_TASK_PRIORITY;
+
+ sbi->sum_blocksize = f2fs_sb_has_packed_ssa(sbi) ?
+ 4096 : sbi->blocksize;
+ sbi->sums_per_block = sbi->blocksize / sbi->sum_blocksize;
+ sbi->entries_in_sum = sbi->sum_blocksize / 8;
+ sbi->sum_entry_size = SUMMARY_SIZE * sbi->entries_in_sum;
+ sbi->sum_journal_size = sbi->sum_blocksize - SUM_FOOTER_SIZE -
+ sbi->sum_entry_size;
+ sbi->nat_journal_entries = (sbi->sum_journal_size - 2) /
+ sizeof(struct nat_journal_entry);
+ sbi->sit_journal_entries = (sbi->sum_journal_size - 2) /
+ sizeof(struct sit_journal_entry);
sbi->dir_level = DEF_DIR_LEVEL;
sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL;
@@ -4307,7 +4357,6 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL;
sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL;
sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL;
- sbi->interval_time[ENABLE_TIME] = DEF_ENABLE_INTERVAL;
sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] =
DEF_UMOUNT_DISCARD_TIMEOUT;
clear_sbi_flag(sbi, SBI_NEED_FSCK);
@@ -4896,14 +4945,13 @@ try_onemore:
sbi->sb = sb;
/* initialize locks within allocated memory */
- init_f2fs_rwsem(&sbi->gc_lock);
+ init_f2fs_rwsem_trace(&sbi->gc_lock, sbi, LOCK_NAME_GC_LOCK);
mutex_init(&sbi->writepages);
- init_f2fs_rwsem(&sbi->cp_global_sem);
- init_f2fs_rwsem(&sbi->node_write);
- init_f2fs_rwsem(&sbi->node_change);
+ init_f2fs_rwsem_trace(&sbi->cp_global_sem, sbi, LOCK_NAME_CP_GLOBAL);
+ init_f2fs_rwsem_trace(&sbi->node_write, sbi, LOCK_NAME_NODE_WRITE);
+ init_f2fs_rwsem_trace(&sbi->node_change, sbi, LOCK_NAME_NODE_CHANGE);
spin_lock_init(&sbi->stat_lock);
- init_f2fs_rwsem(&sbi->cp_rwsem);
- init_f2fs_rwsem(&sbi->cp_enable_rwsem);
+ init_f2fs_rwsem_trace(&sbi->cp_rwsem, sbi, LOCK_NAME_CP_RWSEM);
init_f2fs_rwsem(&sbi->quota_sem);
init_waitqueue_head(&sbi->cp_wait);
spin_lock_init(&sbi->error_lock);
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index c42f4f979d13..5fbfdc96e502 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -35,6 +35,7 @@ enum {
#ifdef CONFIG_F2FS_FAULT_INJECTION
FAULT_INFO_RATE, /* struct f2fs_fault_info */
FAULT_INFO_TYPE, /* struct f2fs_fault_info */
+ FAULT_INFO_TIMEOUT, /* struct f2fs_fault_info */
#endif
RESERVED_BLOCKS, /* struct f2fs_sb_info */
CPRC_INFO, /* struct ckpt_req_control */
@@ -58,6 +59,7 @@ struct f2fs_attr {
const char *buf, size_t len);
int struct_type;
int offset;
+ int size;
int id;
};
@@ -84,7 +86,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
return (unsigned char *)sbi;
#ifdef CONFIG_F2FS_FAULT_INJECTION
else if (struct_type == FAULT_INFO_RATE ||
- struct_type == FAULT_INFO_TYPE)
+ struct_type == FAULT_INFO_TYPE ||
+ struct_type == FAULT_INFO_TIMEOUT)
return (unsigned char *)&F2FS_OPTION(sbi).fault_info;
#endif
#ifdef CONFIG_F2FS_STAT_FS
@@ -344,11 +347,30 @@ static ssize_t main_blkaddr_show(struct f2fs_attr *a,
(unsigned long long)MAIN_BLKADDR(sbi));
}
+static ssize_t __sbi_show_value(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf,
+ unsigned char *value)
+{
+ switch (a->size) {
+ case 1:
+ return sysfs_emit(buf, "%u\n", *(u8 *)value);
+ case 2:
+ return sysfs_emit(buf, "%u\n", *(u16 *)value);
+ case 4:
+ return sysfs_emit(buf, "%u\n", *(u32 *)value);
+ case 8:
+ return sysfs_emit(buf, "%llu\n", *(u64 *)value);
+ default:
+ f2fs_bug_on(sbi, 1);
+ return sysfs_emit(buf,
+ "show sysfs node value with wrong type\n");
+ }
+}
+
static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
unsigned char *ptr = NULL;
- unsigned int *ui;
ptr = __struct_ptr(sbi, a->struct_type);
if (!ptr)
@@ -428,9 +450,30 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
atomic_read(&sbi->cp_call_count[BACKGROUND]));
#endif
- ui = (unsigned int *)(ptr + a->offset);
+ return __sbi_show_value(a, sbi, buf, ptr + a->offset);
+}
- return sysfs_emit(buf, "%u\n", *ui);
+static void __sbi_store_value(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi,
+ unsigned char *ui, unsigned long value)
+{
+ switch (a->size) {
+ case 1:
+ *(u8 *)ui = value;
+ break;
+ case 2:
+ *(u16 *)ui = value;
+ break;
+ case 4:
+ *(u32 *)ui = value;
+ break;
+ case 8:
+ *(u64 *)ui = value;
+ break;
+ default:
+ f2fs_bug_on(sbi, 1);
+ f2fs_err(sbi, "store sysfs node value with wrong type");
+ }
}
static ssize_t __sbi_store(struct f2fs_attr *a,
@@ -529,6 +572,12 @@ out:
return -EINVAL;
return count;
}
+ if (a->struct_type == FAULT_INFO_TIMEOUT) {
+ if (f2fs_build_fault_attr(sbi, 0, t, FAULT_TIMEOUT))
+ return -EINVAL;
+ f2fs_simulate_lock_timeout(sbi);
+ return count;
+ }
#endif
if (a->struct_type == RESERVED_BLOCKS) {
spin_lock(&sbi->stat_lock);
@@ -749,7 +798,7 @@ out:
return count;
}
- if (!strcmp(a->attr.name, "gc_pin_file_threshold")) {
+ if (!strcmp(a->attr.name, "gc_pin_file_thresh")) {
if (t > MAX_GC_FAILED_PINNED_FILES)
return -EINVAL;
sbi->gc_pin_file_threshold = t;
@@ -906,7 +955,36 @@ out:
return count;
}
- *ui = (unsigned int)t;
+ if (!strcmp(a->attr.name, "adjust_lock_priority")) {
+ if (t >= BIT(LOCK_NAME_MAX - 1))
+ return -EINVAL;
+ sbi->adjust_lock_priority = t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "lock_duration_priority")) {
+ if (t < NICE_TO_PRIO(MIN_NICE) || t > NICE_TO_PRIO(MAX_NICE))
+ return -EINVAL;
+ sbi->lock_duration_priority = t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "critical_task_priority")) {
+ if (t < NICE_TO_PRIO(MIN_NICE) || t > NICE_TO_PRIO(MAX_NICE))
+ return -EINVAL;
+ if (!capable(CAP_SYS_NICE))
+ return -EPERM;
+ sbi->critical_task_priority = t;
+ if (sbi->cprc_info.f2fs_issue_ckpt)
+ set_user_nice(sbi->cprc_info.f2fs_issue_ckpt,
+ PRIO_TO_NICE(sbi->critical_task_priority));
+ if (sbi->gc_thread && sbi->gc_thread->f2fs_gc_task)
+ set_user_nice(sbi->gc_thread->f2fs_gc_task,
+ PRIO_TO_NICE(sbi->critical_task_priority));
+ return count;
+ }
+
+ __sbi_store_value(a, sbi, ptr + a->offset, t);
return count;
}
@@ -1053,24 +1131,27 @@ static struct f2fs_attr f2fs_attr_sb_##_name = { \
.id = F2FS_FEATURE_##_feat, \
}
-#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \
+#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset, _size) \
static struct f2fs_attr f2fs_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
.show = _show, \
.store = _store, \
.struct_type = _struct_type, \
- .offset = _offset \
+ .offset = _offset, \
+ .size = _size \
}
#define F2FS_RO_ATTR(struct_type, struct_name, name, elname) \
F2FS_ATTR_OFFSET(struct_type, name, 0444, \
f2fs_sbi_show, NULL, \
- offsetof(struct struct_name, elname))
+ offsetof(struct struct_name, elname), \
+ sizeof_field(struct struct_name, elname))
#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \
F2FS_ATTR_OFFSET(struct_type, name, 0644, \
f2fs_sbi_show, f2fs_sbi_store, \
- offsetof(struct struct_name, elname))
+ offsetof(struct struct_name, elname), \
+ sizeof_field(struct struct_name, elname))
#define F2FS_GENERAL_RO_ATTR(name) \
static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL)
@@ -1219,6 +1300,10 @@ F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy);
F2FS_SBI_GENERAL_RW_ATTR(carve_out);
F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section);
F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware);
+F2FS_SBI_GENERAL_RW_ATTR(max_lock_elapsed_time);
+F2FS_SBI_GENERAL_RW_ATTR(lock_duration_priority);
+F2FS_SBI_GENERAL_RW_ATTR(adjust_lock_priority);
+F2FS_SBI_GENERAL_RW_ATTR(critical_task_priority);
/* STAT_INFO ATTR */
#ifdef CONFIG_F2FS_STAT_FS
@@ -1232,6 +1317,7 @@ STAT_INFO_RO_ATTR(gc_background_calls, gc_call_count[BACKGROUND]);
#ifdef CONFIG_F2FS_FAULT_INJECTION
FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_RATE, inject_rate);
FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_TYPE, inject_type);
+FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_TIMEOUT, inject_lock_timeout);
#endif
/* RESERVED_BLOCKS ATTR */
@@ -1361,6 +1447,7 @@ static struct attribute *f2fs_attrs[] = {
#ifdef CONFIG_F2FS_FAULT_INJECTION
ATTR_LIST(inject_rate),
ATTR_LIST(inject_type),
+ ATTR_LIST(inject_lock_timeout),
#endif
ATTR_LIST(data_io_flag),
ATTR_LIST(node_io_flag),
@@ -1422,6 +1509,10 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(reserved_pin_section),
ATTR_LIST(allocate_section_hint),
ATTR_LIST(allocate_section_policy),
+ ATTR_LIST(max_lock_elapsed_time),
+ ATTR_LIST(lock_duration_priority),
+ ATTR_LIST(adjust_lock_priority),
+ ATTR_LIST(critical_task_priority),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index b4e5c406632f..941dc62a6d6f 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -804,6 +804,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
struct folio *ifolio, int flags)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_lock_context lc;
int err;
if (unlikely(f2fs_cp_error(sbi)))
@@ -821,11 +822,11 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
size, ifolio, flags);
f2fs_balance_fs(sbi, true);
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi, &lc);
f2fs_down_write(&F2FS_I(inode)->i_xattr_sem);
err = __f2fs_setxattr(inode, index, name, value, size, NULL, flags);
f2fs_up_write(&F2FS_I(inode)->i_xattr_sem);
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(sbi, &lc);
f2fs_update_time(sbi, REQ_TIME);
return err;
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index a7880787cad3..dc41722fcc9d 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -17,7 +17,6 @@
#define F2FS_LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9) /* log number for sector/blk */
#define F2FS_BLKSIZE PAGE_SIZE /* support only block == page */
#define F2FS_BLKSIZE_BITS PAGE_SHIFT /* bits for F2FS_BLKSIZE */
-#define F2FS_SUM_BLKSIZE 4096 /* only support 4096 byte sum block */
#define F2FS_MAX_EXTENSION 64 /* # of extension entries */
#define F2FS_EXTENSION_LEN 8 /* max size of extension */
@@ -442,10 +441,8 @@ struct f2fs_sit_block {
* from node's page's beginning to get a data block address.
* ex) data_blkaddr = (block_t)(nodepage_start_address + ofs_in_node)
*/
-#define ENTRIES_IN_SUM (F2FS_SUM_BLKSIZE / 8)
#define SUMMARY_SIZE (7) /* sizeof(struct f2fs_summary) */
#define SUM_FOOTER_SIZE (5) /* sizeof(struct summary_footer) */
-#define SUM_ENTRY_SIZE (SUMMARY_SIZE * ENTRIES_IN_SUM)
/* a summary entry for a block in a segment */
struct f2fs_summary {
@@ -468,22 +465,6 @@ struct summary_footer {
__le32 check_sum; /* summary checksum */
} __packed;
-#define SUM_JOURNAL_SIZE (F2FS_SUM_BLKSIZE - SUM_FOOTER_SIZE -\
- SUM_ENTRY_SIZE)
-#define NAT_JOURNAL_ENTRIES ((SUM_JOURNAL_SIZE - 2) /\
- sizeof(struct nat_journal_entry))
-#define NAT_JOURNAL_RESERVED ((SUM_JOURNAL_SIZE - 2) %\
- sizeof(struct nat_journal_entry))
-#define SIT_JOURNAL_ENTRIES ((SUM_JOURNAL_SIZE - 2) /\
- sizeof(struct sit_journal_entry))
-#define SIT_JOURNAL_RESERVED ((SUM_JOURNAL_SIZE - 2) %\
- sizeof(struct sit_journal_entry))
-
-/* Reserved area should make size of f2fs_extra_info equals to
- * that of nat_journal and sit_journal.
- */
-#define EXTRA_INFO_RESERVED (SUM_JOURNAL_SIZE - 2 - 8)
-
/*
* frequently updated NAT/SIT entries can be stored in the spare area in
* summary blocks
@@ -498,9 +479,16 @@ struct nat_journal_entry {
struct f2fs_nat_entry ne;
} __packed;
+/*
+ * The nat_journal structure is a placeholder whose actual size varies depending
+ * on the use of packed_ssa. Therefore, it must always be accessed only through
+ * specific sets of macros and fields, and size calculations should use
+ * size-related macros instead of sizeof().
+ * Relevant macros: sbi->nat_journal_entries, nat_in_journal(),
+ * nid_in_journal(), MAX_NAT_JENTRIES().
+ */
struct nat_journal {
- struct nat_journal_entry entries[NAT_JOURNAL_ENTRIES];
- __u8 reserved[NAT_JOURNAL_RESERVED];
+ struct nat_journal_entry entries[0];
} __packed;
struct sit_journal_entry {
@@ -508,14 +496,21 @@ struct sit_journal_entry {
struct f2fs_sit_entry se;
} __packed;
+/*
+ * The sit_journal structure is a placeholder whose actual size varies depending
+ * on the use of packed_ssa. Therefore, it must always be accessed only through
+ * specific sets of macros and fields, and size calculations should use
+ * size-related macros instead of sizeof().
+ * Relevant macros: sbi->sit_journal_entries, sit_in_journal(),
+ * segno_in_journal(), MAX_SIT_JENTRIES().
+ */
struct sit_journal {
- struct sit_journal_entry entries[SIT_JOURNAL_ENTRIES];
- __u8 reserved[SIT_JOURNAL_RESERVED];
+ struct sit_journal_entry entries[0];
} __packed;
struct f2fs_extra_info {
__le64 kbytes_written;
- __u8 reserved[EXTRA_INFO_RESERVED];
+ __u8 reserved[];
} __packed;
struct f2fs_journal {
@@ -531,11 +526,33 @@ struct f2fs_journal {
};
} __packed;
-/* Block-sized summary block structure */
+/*
+ * Block-sized summary block structure
+ *
+ * The f2fs_summary_block structure is a placeholder whose actual size varies
+ * depending on the use of packed_ssa. Therefore, it must always be accessed
+ * only through specific sets of macros and fields, and size calculations should
+ * use size-related macros instead of sizeof().
+ * Relevant macros: sbi->sum_blocksize, sbi->entries_in_sum,
+ * sbi->sum_entry_size, sum_entries(), sum_journal(), sum_footer().
+ *
+ * Summary Block Layout
+ *
+ * +-----------------------+ <--- Block Start
+ * | struct f2fs_summary |
+ * | entries[0] |
+ * | ... |
+ * | entries[N-1] |
+ * +-----------------------+
+ * | struct f2fs_journal |
+ * +-----------------------+
+ * | struct summary_footer |
+ * +-----------------------+ <--- Block End
+ */
struct f2fs_summary_block {
- struct f2fs_summary entries[ENTRIES_IN_SUM];
- struct f2fs_journal journal;
- struct summary_footer footer;
+ struct f2fs_summary entries[0];
+ // struct f2fs_journal journal;
+ // struct summary_footer footer;
} __packed;
/*
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index df4017dcc701..9364e6775562 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -184,6 +184,15 @@ TRACE_DEFINE_ENUM(CP_PHASE_FINISH_CHECKPOINT);
{ CP_PHASE_FINISH_BLOCK_OPS, "finish block_ops" }, \
{ CP_PHASE_FINISH_CHECKPOINT, "finish checkpoint" })
+#define show_lock_name(lock) \
+ __print_symbolic(lock, \
+ { LOCK_NAME_CP_RWSEM, "cp_rwsem" }, \
+ { LOCK_NAME_NODE_CHANGE, "node_change" }, \
+ { LOCK_NAME_NODE_WRITE, "node_write" }, \
+ { LOCK_NAME_GC_LOCK, "gc_lock" }, \
+ { LOCK_NAME_CP_GLOBAL, "cp_global" }, \
+ { LOCK_NAME_IO_RWSEM, "io_rwsem" })
+
struct f2fs_sb_info;
struct f2fs_io_info;
struct extent_info;
@@ -1358,6 +1367,7 @@ DECLARE_EVENT_CLASS(f2fs__folio,
__field(int, type)
__field(int, dir)
__field(pgoff_t, index)
+ __field(pgoff_t, nrpages)
__field(int, dirty)
__field(int, uptodate)
),
@@ -1368,16 +1378,18 @@ DECLARE_EVENT_CLASS(f2fs__folio,
__entry->type = type;
__entry->dir = S_ISDIR(folio->mapping->host->i_mode);
__entry->index = folio->index;
+ __entry->nrpages= folio_nr_pages(folio);
__entry->dirty = folio_test_dirty(folio);
__entry->uptodate = folio_test_uptodate(folio);
),
- TP_printk("dev = (%d,%d), ino = %lu, %s, %s, index = %lu, "
+ TP_printk("dev = (%d,%d), ino = %lu, %s, %s, index = %lu, nr_pages = %lu, "
"dirty = %d, uptodate = %d",
show_dev_ino(__entry),
show_block_type(__entry->type),
show_file_type(__entry->dir),
(unsigned long)__entry->index,
+ (unsigned long)__entry->nrpages,
__entry->dirty,
__entry->uptodate)
);
@@ -1403,6 +1415,13 @@ DEFINE_EVENT(f2fs__folio, f2fs_readpage,
TP_ARGS(folio, type)
);
+DEFINE_EVENT(f2fs__folio, f2fs_read_folio,
+
+ TP_PROTO(struct folio *folio, int type),
+
+ TP_ARGS(folio, type)
+);
+
DEFINE_EVENT(f2fs__folio, f2fs_set_page_dirty,
TP_PROTO(struct folio *folio, int type),
@@ -2442,6 +2461,127 @@ DEFINE_EVENT(f2fs__rw_end, f2fs_datawrite_end,
TP_ARGS(inode, offset, bytes)
);
+TRACE_EVENT(f2fs_lock_elapsed_time,
+
+ TP_PROTO(struct f2fs_sb_info *sbi, enum f2fs_lock_name lock_name,
+ bool is_write, struct task_struct *p, int ioprio,
+ unsigned long long total_time,
+ unsigned long long running_time,
+ unsigned long long runnable_time,
+ unsigned long long io_sleep_time,
+ unsigned long long other_time),
+
+ TP_ARGS(sbi, lock_name, is_write, p, ioprio, total_time, running_time,
+ runnable_time, io_sleep_time, other_time),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(int, prio)
+ __field(int, ioprio_class)
+ __field(int, ioprio_data)
+ __field(unsigned int, lock_name)
+ __field(bool, is_write)
+ __field(unsigned long long, total_time)
+ __field(unsigned long long, running_time)
+ __field(unsigned long long, runnable_time)
+ __field(unsigned long long, io_sleep_time)
+ __field(unsigned long long, other_time)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = sbi->sb->s_dev;
+ memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+ __entry->pid = p->pid;
+ __entry->prio = p->prio;
+ __entry->ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
+ __entry->ioprio_data = IOPRIO_PRIO_DATA(ioprio);
+ __entry->lock_name = lock_name;
+ __entry->is_write = is_write;
+ __entry->total_time = total_time;
+ __entry->running_time = running_time;
+ __entry->runnable_time = runnable_time;
+ __entry->io_sleep_time = io_sleep_time;
+ __entry->other_time = other_time;
+ ),
+
+ TP_printk("dev = (%d,%d), comm: %s, pid: %d, prio: %d, "
+ "ioprio_class: %d, ioprio_data: %d, lock_name: %s, "
+ "lock_type: %s, total: %llu, running: %llu, "
+ "runnable: %llu, io_sleep: %llu, other: %llu",
+ show_dev(__entry->dev),
+ __entry->comm,
+ __entry->pid,
+ __entry->prio,
+ __entry->ioprio_class,
+ __entry->ioprio_data,
+ show_lock_name(__entry->lock_name),
+ __entry->is_write ? "wlock" : "rlock",
+ __entry->total_time,
+ __entry->running_time,
+ __entry->runnable_time,
+ __entry->io_sleep_time,
+ __entry->other_time)
+);
+
+DECLARE_EVENT_CLASS(f2fs_priority_update,
+
+ TP_PROTO(struct f2fs_sb_info *sbi, enum f2fs_lock_name lock_name,
+ bool is_write, struct task_struct *p, int orig_prio,
+ int new_prio),
+
+ TP_ARGS(sbi, lock_name, is_write, p, orig_prio, new_prio),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(unsigned int, lock_name)
+ __field(bool, is_write)
+ __field(int, orig_prio)
+ __field(int, new_prio)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = sbi->sb->s_dev;
+ memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+ __entry->pid = p->pid;
+ __entry->lock_name = lock_name;
+ __entry->is_write = is_write;
+ __entry->orig_prio = orig_prio;
+ __entry->new_prio = new_prio;
+ ),
+
+ TP_printk("dev = (%d,%d), comm: %s, pid: %d, lock_name: %s, "
+ "lock_type: %s, orig_prio: %d, new_prio: %d",
+ show_dev(__entry->dev),
+ __entry->comm,
+ __entry->pid,
+ show_lock_name(__entry->lock_name),
+ __entry->is_write ? "wlock" : "rlock",
+ __entry->orig_prio,
+ __entry->new_prio)
+);
+
+DEFINE_EVENT(f2fs_priority_update, f2fs_priority_uplift,
+
+ TP_PROTO(struct f2fs_sb_info *sbi, enum f2fs_lock_name lock_name,
+ bool is_write, struct task_struct *p, int orig_prio,
+ int new_prio),
+
+ TP_ARGS(sbi, lock_name, is_write, p, orig_prio, new_prio)
+);
+
+DEFINE_EVENT(f2fs_priority_update, f2fs_priority_restore,
+
+ TP_PROTO(struct f2fs_sb_info *sbi, enum f2fs_lock_name lock_name,
+ bool is_write, struct task_struct *p, int orig_prio,
+ int new_prio),
+
+ TP_ARGS(sbi, lock_name, is_write, p, orig_prio, new_prio)
+);
+
#endif /* _TRACE_F2FS_H */
/* This part must be outside protection */