summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-fs-f2fs13
-rw-r--r--Documentation/filesystems/f2fs.rst44
-rw-r--r--fs/f2fs/acl.c5
-rw-r--r--fs/f2fs/checkpoint.c2
-rw-r--r--fs/f2fs/data.c114
-rw-r--r--fs/f2fs/debug.c111
-rw-r--r--fs/f2fs/extent_cache.c119
-rw-r--r--fs/f2fs/f2fs.h38
-rw-r--r--fs/f2fs/file.c71
-rw-r--r--fs/f2fs/gc.c19
-rw-r--r--fs/f2fs/gc.h1
-rw-r--r--fs/f2fs/inode.c23
-rw-r--r--fs/f2fs/node.c28
-rw-r--r--fs/f2fs/recovery.c9
-rw-r--r--fs/f2fs/segment.c161
-rw-r--r--fs/f2fs/segment.h72
-rw-r--r--fs/f2fs/super.c101
-rw-r--r--fs/f2fs/sysfs.c16
-rw-r--r--include/linux/f2fs_fs.h7
-rw-r--r--include/uapi/linux/f2fs.h1
20 files changed, 700 insertions, 255 deletions
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index fdedf1ea944b..3e1630c70d8a 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -311,10 +311,13 @@ Description: Do background GC aggressively when set. Set to 0 by default.
GC approach and turns SSR mode on.
gc urgent low(2): lowers the bar of checking I/O idling in
order to process outstanding discard commands and GC a
- little bit aggressively. uses cost benefit GC approach.
+ little bit aggressively. always uses cost benefit GC approach,
+ and will override age-threshold GC approach if ATGC is enabled
+ at the same time.
gc urgent mid(3): does GC forcibly in a period of given
gc_urgent_sleep_time and executes a mid level of I/O idling check.
- uses cost benefit GC approach.
+ always uses cost benefit GC approach, and will override
+ age-threshold GC approach if ATGC is enabled at the same time.
What: /sys/fs/f2fs/<disk>/gc_urgent_sleep_time
Date: August 2017
@@ -819,3 +822,9 @@ Description: It controls the valid block ratio threshold not to trigger excessiv
for zoned deivces. The initial value of it is 95(%). F2FS will stop the
background GC thread from intiating GC for sections having valid blocks
exceeding the ratio.
+
+What: /sys/fs/f2fs/<disk>/max_read_extent_count
+Date: November 2024
+Contact: "Chao Yu" <chao@kernel.org>
+Description: It controls the maximum read extent count per inode; the threshold
+ is 10240 by default.
diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index 68a0885fb5e6..fb7d2ee022bc 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -943,3 +943,47 @@ NVMe Zoned Namespace devices
can start before the zone-capacity and span across zone-capacity boundary.
Such spanning segments are also considered as usable segments. All blocks
past the zone-capacity are considered unusable in these segments.
+
+Device aliasing feature
+-----------------------
+
+f2fs can utilize a special file called a "device aliasing file." This file allows
+the entire storage device to be mapped with a single, large extent, not using
+the usual f2fs node structures. This mapped area is pinned and primarily intended
+for holding the space.
+
+Essentially, this mechanism allows a portion of the f2fs area to be temporarily
+reserved and used by another filesystem or for different purposes. Once that
+external usage is complete, the device aliasing file can be deleted, releasing
+the reserved space back to F2FS for its own use.
+
+<use-case>
+
+# ls /dev/vd*
+/dev/vdb (32GB) /dev/vdc (32GB)
+# mkfs.ext4 /dev/vdc
+# mkfs.f2fs -c /dev/vdc@vdc.file /dev/vdb
+# mount /dev/vdb /mnt/f2fs
+# ls -l /mnt/f2fs
+vdc.file
+# df -h
+/dev/vdb 64G 33G 32G 52% /mnt/f2fs
+
+# mount -o loop /dev/vdc /mnt/ext4
+# df -h
+/dev/vdb 64G 33G 32G 52% /mnt/f2fs
+/dev/loop7 32G 24K 30G 1% /mnt/ext4
+# umount /mnt/ext4
+
+# f2fs_io getflags /mnt/f2fs/vdc.file
+get a flag on /mnt/f2fs/vdc.file ret=0, flags=nocow(pinned),immutable
+# f2fs_io setflags noimmutable /mnt/f2fs/vdc.file
+get a flag on noimmutable ret=0, flags=800010
+set a flag on /mnt/f2fs/vdc.file ret=0, flags=noimmutable
+# rm /mnt/f2fs/vdc.file
+# df -h
+/dev/vdb 64G 753M 64G 2% /mnt/f2fs
+
+So, the key idea is that the user can perform any file operations on /dev/vdc
+and reclaim the space after use, while the space is still counted as /data.
+This does not require modifying the partition size or the filesystem format.
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 8bffdeccdbc3..1fbc0607363b 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -296,9 +296,8 @@ static struct posix_acl *f2fs_acl_clone(const struct posix_acl *acl,
struct posix_acl *clone = NULL;
if (acl) {
- int size = sizeof(struct posix_acl) + acl->a_count *
- sizeof(struct posix_acl_entry);
- clone = kmemdup(acl, size, flags);
+ clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count),
+ flags);
if (clone)
refcount_set(&clone->a_refcount, 1);
}
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 7f76460b721f..efda9a022981 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -32,7 +32,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io,
f2fs_build_fault_attr(sbi, 0, 0);
if (!end_io)
f2fs_flush_merged_writes(sbi);
- f2fs_handle_critical_error(sbi, reason, end_io);
+ f2fs_handle_critical_error(sbi, reason);
}
/*
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index e3ce763cce18..a2478c2afb3a 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1679,7 +1679,8 @@ next_block:
/* reserved delalloc block should be mapped for fiemap. */
if (blkaddr == NEW_ADDR)
map->m_flags |= F2FS_MAP_DELALLOC;
- if (flag != F2FS_GET_BLOCK_DIO || !is_hole)
+ /* DIO READ and hole case, should not map the blocks. */
+ if (!(flag == F2FS_GET_BLOCK_DIO && is_hole && !map->m_may_create))
map->m_flags |= F2FS_MAP_MAPPED;
map->m_pblk = blkaddr;
@@ -1821,16 +1822,6 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len)
return true;
}
-static inline u64 bytes_to_blks(struct inode *inode, u64 bytes)
-{
- return (bytes >> inode->i_blkbits);
-}
-
-static inline u64 blks_to_bytes(struct inode *inode, u64 blks)
-{
- return (blks << inode->i_blkbits);
-}
-
static int f2fs_xattr_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo)
{
@@ -1856,7 +1847,7 @@ static int f2fs_xattr_fiemap(struct inode *inode,
return err;
}
- phys = blks_to_bytes(inode, ni.blk_addr);
+ phys = F2FS_BLK_TO_BYTES(ni.blk_addr);
offset = offsetof(struct f2fs_inode, i_addr) +
sizeof(__le32) * (DEF_ADDRS_PER_INODE -
get_inline_xattr_addrs(inode));
@@ -1888,7 +1879,7 @@ static int f2fs_xattr_fiemap(struct inode *inode,
return err;
}
- phys = blks_to_bytes(inode, ni.blk_addr);
+ phys = F2FS_BLK_TO_BYTES(ni.blk_addr);
len = inode->i_sb->s_blocksize;
f2fs_put_page(page, 1);
@@ -1904,30 +1895,11 @@ static int f2fs_xattr_fiemap(struct inode *inode,
return (err < 0 ? err : 0);
}
-static loff_t max_inode_blocks(struct inode *inode)
-{
- loff_t result = ADDRS_PER_INODE(inode);
- loff_t leaf_count = ADDRS_PER_BLOCK(inode);
-
- /* two direct node blocks */
- result += (leaf_count * 2);
-
- /* two indirect node blocks */
- leaf_count *= NIDS_PER_BLOCK;
- result += (leaf_count * 2);
-
- /* one double indirect node block */
- leaf_count *= NIDS_PER_BLOCK;
- result += leaf_count;
-
- return result;
-}
-
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
struct f2fs_map_blocks map;
- sector_t start_blk, last_blk;
+ sector_t start_blk, last_blk, blk_len, max_len;
pgoff_t next_pgofs;
u64 logical = 0, phys = 0, size = 0;
u32 flags = 0;
@@ -1969,16 +1941,15 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
goto out;
}
- if (bytes_to_blks(inode, len) == 0)
- len = blks_to_bytes(inode, 1);
-
- start_blk = bytes_to_blks(inode, start);
- last_blk = bytes_to_blks(inode, start + len - 1);
+ start_blk = F2FS_BYTES_TO_BLK(start);
+ last_blk = F2FS_BYTES_TO_BLK(start + len - 1);
+ blk_len = last_blk - start_blk + 1;
+ max_len = F2FS_BYTES_TO_BLK(maxbytes) - start_blk;
next:
memset(&map, 0, sizeof(map));
map.m_lblk = start_blk;
- map.m_len = bytes_to_blks(inode, len);
+ map.m_len = blk_len;
map.m_next_pgofs = &next_pgofs;
map.m_seg_type = NO_CHECK_TYPE;
@@ -1995,13 +1966,23 @@ next:
if (!compr_cluster && !(map.m_flags & F2FS_MAP_FLAGS)) {
start_blk = next_pgofs;
- if (blks_to_bytes(inode, start_blk) < blks_to_bytes(inode,
- max_inode_blocks(inode)))
+ if (F2FS_BLK_TO_BYTES(start_blk) < maxbytes)
goto prep_next;
flags |= FIEMAP_EXTENT_LAST;
}
+ /*
+ * current extent may cross boundary of inquiry, increase len to
+ * requery.
+ */
+ if (!compr_cluster && (map.m_flags & F2FS_MAP_MAPPED) &&
+ map.m_lblk + map.m_len - 1 == last_blk &&
+ blk_len != max_len) {
+ blk_len = max_len;
+ goto next;
+ }
+
compr_appended = false;
/* In a case of compressed cluster, append this to the last extent */
if (compr_cluster && ((map.m_flags & F2FS_MAP_DELALLOC) ||
@@ -2033,14 +2014,14 @@ skip_fill:
} else if (compr_appended) {
unsigned int appended_blks = cluster_size -
count_in_cluster + 1;
- size += blks_to_bytes(inode, appended_blks);
+ size += F2FS_BLK_TO_BYTES(appended_blks);
start_blk += appended_blks;
compr_cluster = false;
} else {
- logical = blks_to_bytes(inode, start_blk);
+ logical = F2FS_BLK_TO_BYTES(start_blk);
phys = __is_valid_data_blkaddr(map.m_pblk) ?
- blks_to_bytes(inode, map.m_pblk) : 0;
- size = blks_to_bytes(inode, map.m_len);
+ F2FS_BLK_TO_BYTES(map.m_pblk) : 0;
+ size = F2FS_BLK_TO_BYTES(map.m_len);
flags = 0;
if (compr_cluster) {
@@ -2048,13 +2029,13 @@ skip_fill:
count_in_cluster += map.m_len;
if (count_in_cluster == cluster_size) {
compr_cluster = false;
- size += blks_to_bytes(inode, 1);
+ size += F2FS_BLKSIZE;
}
} else if (map.m_flags & F2FS_MAP_DELALLOC) {
flags = FIEMAP_EXTENT_UNWRITTEN;
}
- start_blk += bytes_to_blks(inode, size);
+ start_blk += F2FS_BYTES_TO_BLK(size);
}
prep_next:
@@ -2092,7 +2073,7 @@ static int f2fs_read_single_page(struct inode *inode, struct folio *folio,
struct readahead_control *rac)
{
struct bio *bio = *bio_ret;
- const unsigned blocksize = blks_to_bytes(inode, 1);
+ const unsigned int blocksize = F2FS_BLKSIZE;
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
@@ -2102,8 +2083,8 @@ static int f2fs_read_single_page(struct inode *inode, struct folio *folio,
block_in_file = (sector_t)index;
last_block = block_in_file + nr_pages;
- last_block_in_file = bytes_to_blks(inode,
- f2fs_readpage_limit(inode) + blocksize - 1);
+ last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) +
+ blocksize - 1);
if (last_block > last_block_in_file)
last_block = last_block_in_file;
@@ -2203,7 +2184,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
struct bio *bio = *bio_ret;
unsigned int start_idx = cc->cluster_idx << cc->log_cluster_size;
sector_t last_block_in_file;
- const unsigned blocksize = blks_to_bytes(inode, 1);
+ const unsigned int blocksize = F2FS_BLKSIZE;
struct decompress_io_ctx *dic = NULL;
struct extent_info ei = {};
bool from_dnode = true;
@@ -2212,8 +2193,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc));
- last_block_in_file = bytes_to_blks(inode,
- f2fs_readpage_limit(inode) + blocksize - 1);
+ last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) +
+ blocksize - 1);
/* get rid of pages beyond EOF */
for (i = 0; i < cc->cluster_size; i++) {
@@ -2388,10 +2369,10 @@ static int f2fs_mpage_readpages(struct inode *inode,
.nr_cpages = 0,
};
pgoff_t nc_cluster_idx = NULL_CLUSTER;
+ pgoff_t index;
#endif
unsigned nr_pages = rac ? readahead_count(rac) : 1;
unsigned max_nr_pages = nr_pages;
- pgoff_t index;
int ret = 0;
map.m_pblk = 0;
@@ -2409,9 +2390,9 @@ static int f2fs_mpage_readpages(struct inode *inode,
prefetchw(&folio->flags);
}
+#ifdef CONFIG_F2FS_FS_COMPRESSION
index = folio_index(folio);
-#ifdef CONFIG_F2FS_FS_COMPRESSION
if (!f2fs_compressed_file(inode))
goto read_single_page;
@@ -3444,6 +3425,11 @@ restart:
if (!f2fs_lookup_read_extent_cache_block(inode, index,
&dn.data_blkaddr)) {
+ if (IS_DEVICE_ALIASING(inode)) {
+ err = -ENODATA;
+ goto out;
+ }
+
if (locked) {
err = f2fs_reserve_block(&dn, index);
goto out;
@@ -3974,7 +3960,7 @@ static int check_swap_activate(struct swap_info_struct *sis,
* to be very smart.
*/
cur_lblock = 0;
- last_lblock = bytes_to_blks(inode, i_size_read(inode));
+ last_lblock = F2FS_BYTES_TO_BLK(i_size_read(inode));
while (cur_lblock < last_lblock && cur_lblock < sis->max) {
struct f2fs_map_blocks map;
@@ -4217,8 +4203,8 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
pgoff_t next_pgofs = 0;
int err;
- map.m_lblk = bytes_to_blks(inode, offset);
- map.m_len = bytes_to_blks(inode, offset + length - 1) - map.m_lblk + 1;
+ map.m_lblk = F2FS_BYTES_TO_BLK(offset);
+ map.m_len = F2FS_BYTES_TO_BLK(offset + length - 1) - map.m_lblk + 1;
map.m_next_pgofs = &next_pgofs;
map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode),
inode->i_write_hint);
@@ -4229,7 +4215,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (err)
return err;
- iomap->offset = blks_to_bytes(inode, map.m_lblk);
+ iomap->offset = F2FS_BLK_TO_BYTES(map.m_lblk);
/*
* When inline encryption is enabled, sometimes I/O to an encrypted file
@@ -4249,21 +4235,21 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (WARN_ON_ONCE(map.m_pblk == NEW_ADDR))
return -EINVAL;
- iomap->length = blks_to_bytes(inode, map.m_len);
+ iomap->length = F2FS_BLK_TO_BYTES(map.m_len);
iomap->type = IOMAP_MAPPED;
iomap->flags |= IOMAP_F_MERGED;
iomap->bdev = map.m_bdev;
- iomap->addr = blks_to_bytes(inode, map.m_pblk);
+ iomap->addr = F2FS_BLK_TO_BYTES(map.m_pblk);
} else {
if (flags & IOMAP_WRITE)
return -ENOTBLK;
if (map.m_pblk == NULL_ADDR) {
- iomap->length = blks_to_bytes(inode, next_pgofs) -
- iomap->offset;
+ iomap->length = F2FS_BLK_TO_BYTES(next_pgofs) -
+ iomap->offset;
iomap->type = IOMAP_HOLE;
} else if (map.m_pblk == NEW_ADDR) {
- iomap->length = blks_to_bytes(inode, map.m_len);
+ iomap->length = F2FS_BLK_TO_BYTES(map.m_len);
iomap->type = IOMAP_UNWRITTEN;
} else {
f2fs_bug_on(F2FS_I_SB(inode), 1);
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 546b8ba91261..468828288a4a 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -60,6 +60,70 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi)
}
#ifdef CONFIG_DEBUG_FS
+static void update_multidevice_stats(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_stat_info *si = F2FS_STAT(sbi);
+ struct f2fs_dev_stats *dev_stats = si->dev_stats;
+ int i, j;
+
+ if (!f2fs_is_multi_device(sbi))
+ return;
+
+ memset(dev_stats, 0, sizeof(struct f2fs_dev_stats) * sbi->s_ndevs);
+ for (i = 0; i < sbi->s_ndevs; i++) {
+ unsigned int start_segno, end_segno;
+ block_t start_blk, end_blk;
+
+ if (i == 0) {
+ start_blk = MAIN_BLKADDR(sbi);
+ end_blk = FDEV(i).end_blk + 1 - SEG0_BLKADDR(sbi);
+ } else {
+ start_blk = FDEV(i).start_blk;
+ end_blk = FDEV(i).end_blk + 1;
+ }
+
+ start_segno = GET_SEGNO(sbi, start_blk);
+ end_segno = GET_SEGNO(sbi, end_blk);
+
+ for (j = start_segno; j < end_segno; j++) {
+ unsigned int seg_blks, sec_blks;
+
+ seg_blks = get_seg_entry(sbi, j)->valid_blocks;
+
+ /* update segment stats */
+ if (IS_CURSEG(sbi, j))
+ dev_stats[i].devstats[0][DEVSTAT_INUSE]++;
+ else if (seg_blks == BLKS_PER_SEG(sbi))
+ dev_stats[i].devstats[0][DEVSTAT_FULL]++;
+ else if (seg_blks != 0)
+ dev_stats[i].devstats[0][DEVSTAT_DIRTY]++;
+ else if (!test_bit(j, FREE_I(sbi)->free_segmap))
+ dev_stats[i].devstats[0][DEVSTAT_FREE]++;
+ else
+ dev_stats[i].devstats[0][DEVSTAT_PREFREE]++;
+
+ if (!__is_large_section(sbi) ||
+ (j % SEGS_PER_SEC(sbi)) != 0)
+ continue;
+
+ sec_blks = get_sec_entry(sbi, j)->valid_blocks;
+
+ /* update section stats */
+ if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, j)))
+ dev_stats[i].devstats[1][DEVSTAT_INUSE]++;
+ else if (sec_blks == BLKS_PER_SEC(sbi))
+ dev_stats[i].devstats[1][DEVSTAT_FULL]++;
+ else if (sec_blks != 0)
+ dev_stats[i].devstats[1][DEVSTAT_DIRTY]++;
+ else if (!test_bit(GET_SEC_FROM_SEG(sbi, j),
+ FREE_I(sbi)->free_secmap))
+ dev_stats[i].devstats[1][DEVSTAT_FREE]++;
+ else
+ dev_stats[i].devstats[1][DEVSTAT_PREFREE]++;
+ }
+ }
+}
+
static void update_general_status(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
@@ -214,6 +278,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->valid_blks[type] += blks;
}
+ update_multidevice_stats(sbi);
+
for (i = 0; i < MAX_CALL_TYPE; i++)
si->cp_call_count[i] = atomic_read(&sbi->cp_call_count[i]);
@@ -498,6 +564,36 @@ static int stat_show(struct seq_file *s, void *v)
si->dirty_count);
seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n",
si->prefree_count, si->free_segs, si->free_secs);
+ if (f2fs_is_multi_device(sbi)) {
+ seq_puts(s, "Multidevice stats:\n");
+ seq_printf(s, " [seg: %8s %8s %8s %8s %8s]",
+ "inuse", "dirty", "full", "free", "prefree");
+ if (__is_large_section(sbi))
+ seq_printf(s, " [sec: %8s %8s %8s %8s %8s]\n",
+ "inuse", "dirty", "full", "free", "prefree");
+ else
+ seq_puts(s, "\n");
+
+ for (i = 0; i < sbi->s_ndevs; i++) {
+ seq_printf(s, " #%-2d %8u %8u %8u %8u %8u", i,
+ si->dev_stats[i].devstats[0][DEVSTAT_INUSE],
+ si->dev_stats[i].devstats[0][DEVSTAT_DIRTY],
+ si->dev_stats[i].devstats[0][DEVSTAT_FULL],
+ si->dev_stats[i].devstats[0][DEVSTAT_FREE],
+ si->dev_stats[i].devstats[0][DEVSTAT_PREFREE]);
+ if (!__is_large_section(sbi)) {
+ seq_puts(s, "\n");
+ continue;
+ }
+ seq_printf(s, " %8u %8u %8u %8u %8u\n",
+ si->dev_stats[i].devstats[1][DEVSTAT_INUSE],
+ si->dev_stats[i].devstats[1][DEVSTAT_DIRTY],
+ si->dev_stats[i].devstats[1][DEVSTAT_FULL],
+ si->dev_stats[i].devstats[1][DEVSTAT_FREE],
+ si->dev_stats[i].devstats[1][DEVSTAT_PREFREE]);
+ }
+ seq_puts(s, "\n");
+ }
seq_printf(s, "CP calls: %d (BG: %d)\n",
si->cp_call_count[TOTAL_CALL],
si->cp_call_count[BACKGROUND]);
@@ -598,9 +694,9 @@ static int stat_show(struct seq_file *s, void *v)
si->ndirty_node, si->node_pages);
seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n",
si->ndirty_dent, si->ndirty_dirs, si->ndirty_all);
- seq_printf(s, " - datas: %4d in files:%4d\n",
+ seq_printf(s, " - data: %4d in files:%4d\n",
si->ndirty_data, si->ndirty_files);
- seq_printf(s, " - quota datas: %4d in quota files:%4d\n",
+ seq_printf(s, " - quota data: %4d in quota files:%4d\n",
si->ndirty_qdata, si->nquota_files);
seq_printf(s, " - meta: %4d in %4d\n",
si->ndirty_meta, si->meta_pages);
@@ -665,6 +761,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_stat_info *si;
+ struct f2fs_dev_stats *dev_stats;
unsigned long flags;
int i;
@@ -672,6 +769,15 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
if (!si)
return -ENOMEM;
+ dev_stats = f2fs_kzalloc(sbi, sizeof(struct f2fs_dev_stats) *
+ sbi->s_ndevs, GFP_KERNEL);
+ if (!dev_stats) {
+ kfree(si);
+ return -ENOMEM;
+ }
+
+ si->dev_stats = dev_stats;
+
si->all_area_segs = le32_to_cpu(raw_super->segment_count);
si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit);
si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat);
@@ -724,6 +830,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
list_del(&si->stat_list);
raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
+ kfree(si->dev_stats);
kfree(si);
}
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 62ac440d9416..347b3b647834 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -24,6 +24,7 @@ bool sanity_check_extent_cache(struct inode *inode, struct page *ipage)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext;
struct extent_info ei;
+ int devi;
get_read_extent_info(&ei, i_ext);
@@ -38,7 +39,36 @@ bool sanity_check_extent_cache(struct inode *inode, struct page *ipage)
ei.blk, ei.fofs, ei.len);
return false;
}
- return true;
+
+ if (!IS_DEVICE_ALIASING(inode))
+ return true;
+
+ for (devi = 0; devi < sbi->s_ndevs; devi++) {
+ if (FDEV(devi).start_blk != ei.blk ||
+ FDEV(devi).end_blk != ei.blk + ei.len - 1)
+ continue;
+
+ if (devi == 0) {
+ f2fs_warn(sbi,
+ "%s: inode (ino=%lx) is an alias of meta device",
+ __func__, inode->i_ino);
+ return false;
+ }
+
+ if (bdev_is_zoned(FDEV(devi).bdev)) {
+ f2fs_warn(sbi,
+ "%s: device alias inode (ino=%lx)'s extent info "
+ "[%u, %u, %u] maps to zoned block device",
+ __func__, inode->i_ino, ei.blk, ei.fofs, ei.len);
+ return false;
+ }
+ return true;
+ }
+
+ f2fs_warn(sbi, "%s: device alias inode (ino=%lx)'s extent info "
+ "[%u, %u, %u] is inconsistent w/ any devices",
+ __func__, inode->i_ino, ei.blk, ei.fofs, ei.len);
+ return false;
}
static void __set_extent_info(struct extent_info *ei,
@@ -76,6 +106,9 @@ static bool __init_may_extent_tree(struct inode *inode, enum extent_type type)
static bool __may_extent_tree(struct inode *inode, enum extent_type type)
{
+ if (IS_DEVICE_ALIASING(inode) && type == EX_READ)
+ return true;
+
/*
* for recovered files during mount do not create extents
* if shrinker is not registered.
@@ -346,21 +379,22 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode,
}
static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
- struct extent_tree *et)
+ struct extent_tree *et, unsigned int nr_shrink)
{
struct rb_node *node, *next;
struct extent_node *en;
- unsigned int count = atomic_read(&et->node_cnt);
+ unsigned int count;
node = rb_first_cached(&et->root);
- while (node) {
+
+ for (count = 0; node && count < nr_shrink; count++) {
next = rb_next(node);
en = rb_entry(node, struct extent_node, rb_node);
__release_extent_node(sbi, et, en);
node = next;
}
- return count - atomic_read(&et->node_cnt);
+ return count;
}
static void __drop_largest_extent(struct extent_tree *et,
@@ -401,6 +435,11 @@ void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage)
if (atomic_read(&et->node_cnt) || !ei.len)
goto skip;
+ if (IS_DEVICE_ALIASING(inode)) {
+ et->largest = ei;
+ goto skip;
+ }
+
en = __attach_extent_node(sbi, et, &ei, NULL,
&et->root.rb_root.rb_node, true);
if (en) {
@@ -463,6 +502,11 @@ static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
goto out;
}
+ if (IS_DEVICE_ALIASING(inode)) {
+ ret = false;
+ goto out;
+ }
+
en = __lookup_extent_node(&et->root, et->cached_en, pgofs);
if (!en)
goto out;
@@ -579,6 +623,30 @@ do_insert:
return en;
}
+static unsigned int __destroy_extent_node(struct inode *inode,
+ enum extent_type type)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[type];
+ unsigned int nr_shrink = type == EX_READ ?
+ READ_EXTENT_CACHE_SHRINK_NUMBER :
+ AGE_EXTENT_CACHE_SHRINK_NUMBER;
+ unsigned int node_cnt = 0;
+
+ if (!et || !atomic_read(&et->node_cnt))
+ return 0;
+
+ while (atomic_read(&et->node_cnt)) {
+ write_lock(&et->lock);
+ node_cnt += __free_extent_tree(sbi, et, nr_shrink);
+ write_unlock(&et->lock);
+ }
+
+ f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
+
+ return node_cnt;
+}
+
static void __update_extent_tree_range(struct inode *inode,
struct extent_info *tei, enum extent_type type)
{
@@ -649,7 +717,9 @@ static void __update_extent_tree_range(struct inode *inode,
}
if (end < org_end && (type != EX_READ ||
- org_end - end >= F2FS_MIN_EXTENT_LEN)) {
+ (org_end - end >= F2FS_MIN_EXTENT_LEN &&
+ atomic_read(&et->node_cnt) <
+ sbi->max_read_extent_count))) {
if (parts) {
__set_extent_info(&ei,
end, org_end - end,
@@ -717,9 +787,6 @@ static void __update_extent_tree_range(struct inode *inode,
}
}
- if (is_inode_flag_set(inode, FI_NO_EXTENT))
- __free_extent_tree(sbi, et);
-
if (et->largest_updated) {
et->largest_updated = false;
updated = true;
@@ -737,6 +804,9 @@ update_age_extent_cache:
out_read_extent_cache:
write_unlock(&et->lock);
+ if (is_inode_flag_set(inode, FI_NO_EXTENT))
+ __destroy_extent_node(inode, EX_READ);
+
if (updated)
f2fs_mark_inode_dirty_sync(inode, true);
}
@@ -899,10 +969,14 @@ static unsigned int __shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink
list_for_each_entry_safe(et, next, &eti->zombie_list, list) {
if (atomic_read(&et->node_cnt)) {
write_lock(&et->lock);
- node_cnt += __free_extent_tree(sbi, et);
+ node_cnt += __free_extent_tree(sbi, et,
+ nr_shrink - node_cnt - tree_cnt);
write_unlock(&et->lock);
}
- f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
+
+ if (atomic_read(&et->node_cnt))
+ goto unlock_out;
+
list_del_init(&et->list);
radix_tree_delete(&eti->extent_tree_root, et->ino);
kmem_cache_free(extent_tree_slab, et);
@@ -1041,23 +1115,6 @@ unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink
return __shrink_extent_tree(sbi, nr_shrink, EX_BLOCK_AGE);
}
-static unsigned int __destroy_extent_node(struct inode *inode,
- enum extent_type type)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree[type];
- unsigned int node_cnt = 0;
-
- if (!et || !atomic_read(&et->node_cnt))
- return 0;
-
- write_lock(&et->lock);
- node_cnt = __free_extent_tree(sbi, et);
- write_unlock(&et->lock);
-
- return node_cnt;
-}
-
void f2fs_destroy_extent_node(struct inode *inode)
{
__destroy_extent_node(inode, EX_READ);
@@ -1066,7 +1123,6 @@ void f2fs_destroy_extent_node(struct inode *inode)
static void __drop_extent_tree(struct inode *inode, enum extent_type type)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct extent_tree *et = F2FS_I(inode)->extent_tree[type];
bool updated = false;
@@ -1074,7 +1130,6 @@ static void __drop_extent_tree(struct inode *inode, enum extent_type type)
return;
write_lock(&et->lock);
- __free_extent_tree(sbi, et);
if (type == EX_READ) {
set_inode_flag(inode, FI_NO_EXTENT);
if (et->largest.len) {
@@ -1083,6 +1138,9 @@ static void __drop_extent_tree(struct inode *inode, enum extent_type type)
}
}
write_unlock(&et->lock);
+
+ __destroy_extent_node(inode, type);
+
if (updated)
f2fs_mark_inode_dirty_sync(inode, true);
}
@@ -1156,6 +1214,7 @@ void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi)
sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD;
sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD;
sbi->last_age_weight = LAST_AGE_WEIGHT;
+ sbi->max_read_extent_count = DEF_MAX_READ_EXTENT_COUNT;
}
int __init f2fs_create_extent_cache(void)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 33f5449dc22d..6f2cbf4c5740 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -213,6 +213,7 @@ struct f2fs_mount_info {
#define F2FS_FEATURE_CASEFOLD 0x00001000
#define F2FS_FEATURE_COMPRESSION 0x00002000
#define F2FS_FEATURE_RO 0x00004000
+#define F2FS_FEATURE_DEVICE_ALIAS 0x00008000
#define __F2FS_HAS_FEATURE(raw_super, mask) \
((raw_super->feature & cpu_to_le32(mask)) != 0)
@@ -634,6 +635,9 @@ enum {
#define DEF_HOT_DATA_AGE_THRESHOLD 262144
#define DEF_WARM_DATA_AGE_THRESHOLD 2621440
+/* default max read extent count per inode */
+#define DEF_MAX_READ_EXTENT_COUNT 10240
+
/* extent cache type */
enum extent_type {
EX_READ,
@@ -1018,7 +1022,7 @@ static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode,
#define NR_CURSEG_PERSIST_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE)
#define NR_CURSEG_TYPE (NR_CURSEG_INMEM_TYPE + NR_CURSEG_PERSIST_TYPE)
-enum {
+enum log_type {
CURSEG_HOT_DATA = 0, /* directory entry blocks */
CURSEG_WARM_DATA, /* data blocks */
CURSEG_COLD_DATA, /* multimedia or GCed data blocks */
@@ -1063,7 +1067,6 @@ struct f2fs_sm_info {
unsigned int segment_count; /* total # of segments */
unsigned int main_segments; /* # of segments in main area */
unsigned int reserved_segments; /* # of reserved segments */
- unsigned int additional_reserved_segments;/* reserved segs for IO align feature */
unsigned int ovp_segments; /* # of overprovision segments */
/* a threshold to reclaim prefree segments */
@@ -1619,6 +1622,7 @@ struct f2fs_sb_info {
/* for extent tree cache */
struct extent_tree_info extent_tree[NR_EXTENT_CACHES];
atomic64_t allocated_data_blocks; /* for block age extent_cache */
+ unsigned int max_read_extent_count; /* max read extent count per inode */
/* The threshold used for hot and warm data seperation*/
unsigned int hot_data_age_threshold;
@@ -1758,6 +1762,7 @@ struct f2fs_sb_info {
unsigned int dirty_device; /* for checkpoint data flush */
spinlock_t dev_lock; /* protect dirty_device */
bool aligned_blksize; /* all devices has the same logical blksize */
+ unsigned int first_zoned_segno; /* first zoned segno */
/* For write statistics */
u64 sectors_written_start;
@@ -3046,6 +3051,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
#define F2FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
#define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
#define F2FS_CASEFOLD_FL 0x40000000 /* Casefolded file */
+#define F2FS_DEVICE_ALIAS_FL 0x80000000 /* File for aliasing a device */
#define F2FS_QUOTA_DEFAULT_FL (F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL)
@@ -3061,6 +3067,8 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
/* Flags that are appropriate for non-directories/regular files. */
#define F2FS_OTHER_FLMASK (F2FS_NODUMP_FL | F2FS_NOATIME_FL)
+#define IS_DEVICE_ALIASING(inode) (F2FS_I(inode)->i_flags & F2FS_DEVICE_ALIAS_FL)
+
static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
{
if (S_ISDIR(mode))
@@ -3632,8 +3640,7 @@ int f2fs_quota_sync(struct super_block *sb, int type);
loff_t max_file_blocks(struct inode *inode);
void f2fs_quota_off_umount(struct super_block *sb);
void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag);
-void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason,
- bool irq_context);
+void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason);
void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error);
void f2fs_handle_error_async(struct f2fs_sb_info *sbi, unsigned char error);
int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover);
@@ -3754,7 +3761,8 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
block_t old_addr, block_t new_addr,
unsigned char version, bool recover_curseg,
bool recover_newaddr);
-int f2fs_get_segment_temp(int seg_type);
+enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi,
+ enum log_type seg_type);
int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, int type,
@@ -3771,8 +3779,7 @@ void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk);
int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
unsigned int val, int alloc);
void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
-int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi);
-int f2fs_check_write_pointer(struct f2fs_sb_info *sbi);
+int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi);
int f2fs_build_segment_manager(struct f2fs_sb_info *sbi);
void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi);
int __init f2fs_create_segment_manager_caches(void);
@@ -3783,6 +3790,8 @@ enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi);
unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi,
unsigned int segno);
+unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi,
+ unsigned int segno);
#define DEF_FRAGMENT_SIZE 4
#define MIN_FRAGMENT_SIZE 1
@@ -3935,6 +3944,19 @@ void f2fs_destroy_recovery_cache(void);
* debug.c
*/
#ifdef CONFIG_F2FS_STAT_FS
+enum {
+ DEVSTAT_INUSE,
+ DEVSTAT_DIRTY,
+ DEVSTAT_FULL,
+ DEVSTAT_FREE,
+ DEVSTAT_PREFREE,
+ DEVSTAT_MAX,
+};
+
+struct f2fs_dev_stats {
+ unsigned int devstats[2][DEVSTAT_MAX]; /* 0: segs, 1: secs */
+};
+
struct f2fs_stat_info {
struct list_head stat_list;
struct f2fs_sb_info *sbi;
@@ -3998,6 +4020,7 @@ struct f2fs_stat_info {
unsigned int block_count[2];
unsigned int inplace_count;
unsigned long long base_mem, cache_mem, page_mem;
+ struct f2fs_dev_stats *dev_stats;
};
static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
@@ -4510,6 +4533,7 @@ F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM);
F2FS_FEATURE_FUNCS(casefold, CASEFOLD);
F2FS_FEATURE_FUNCS(compression, COMPRESSION);
F2FS_FEATURE_FUNCS(readonly, RO);
+F2FS_FEATURE_FUNCS(device_alias, DEVICE_ALIAS);
#ifdef CONFIG_BLK_DEV_ZONED
static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 84447d5145aa..aa9679b3d8e4 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -725,6 +725,11 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock)
trace_f2fs_truncate_blocks_enter(inode, from);
+ if (IS_DEVICE_ALIASING(inode) && from) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
free_from = (pgoff_t)F2FS_BLK_ALIGN(from);
if (free_from >= max_file_blocks(inode))
@@ -739,6 +744,21 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock)
goto out;
}
+ if (IS_DEVICE_ALIASING(inode)) {
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ];
+ struct extent_info ei = et->largest;
+ unsigned int i;
+
+ for (i = 0; i < ei.len; i++)
+ f2fs_invalidate_blocks(sbi, ei.blk + i);
+
+ dec_valid_block_count(sbi, inode, ei.len);
+ f2fs_update_time(sbi, REQ_TIME);
+
+ f2fs_put_page(ipage, 1);
+ goto out;
+ }
+
if (f2fs_has_inline_data(inode)) {
f2fs_truncate_inline_inode(inode, ipage, from);
f2fs_put_page(ipage, 1);
@@ -774,7 +794,7 @@ free_partial:
/* lastly zero out the first data page */
if (!err)
err = truncate_partial_data_page(inode, from, truncate_page);
-
+out_err:
trace_f2fs_truncate_blocks_exit(inode, err);
return err;
}
@@ -863,7 +883,11 @@ static bool f2fs_force_buffered_io(struct inode *inode, int rw)
return true;
if (f2fs_compressed_file(inode))
return true;
- if (f2fs_has_inline_data(inode))
+ /*
+ * only force direct read to use buffered IO, for direct write,
+ * it expects inline data conversion before committing IO.
+ */
+ if (f2fs_has_inline_data(inode) && rw == READ)
return true;
/* disallow direct IO if any of devices has unaligned blksize */
@@ -992,7 +1016,8 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
return -EPERM;
if ((attr->ia_valid & ATTR_SIZE)) {
- if (!f2fs_is_compress_backend_ready(inode))
+ if (!f2fs_is_compress_backend_ready(inode) ||
+ IS_DEVICE_ALIASING(inode))
return -EOPNOTSUPP;
if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED) &&
!IS_ALIGNED(attr->ia_size,
@@ -1790,7 +1815,8 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset,
map.m_len = sec_blks;
next_alloc:
- if (has_not_enough_free_secs(sbi, 0,
+ if (has_not_enough_free_secs(sbi, 0, f2fs_sb_has_blkzoned(sbi) ?
+ ZONED_PIN_SEC_REQUIRED_COUNT :
GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) {
f2fs_down_write(&sbi->gc_lock);
stat_inc_gc_call_count(sbi, FOREGROUND);
@@ -1860,7 +1886,7 @@ static long f2fs_fallocate(struct file *file, int mode,
return -EIO;
if (!f2fs_is_checkpoint_ready(F2FS_I_SB(inode)))
return -ENOSPC;
- if (!f2fs_is_compress_backend_ready(inode))
+ if (!f2fs_is_compress_backend_ready(inode) || IS_DEVICE_ALIASING(inode))
return -EOPNOTSUPP;
/* f2fs only support ->fallocate for regular file */
@@ -2343,9 +2369,12 @@ int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag,
if (readonly)
goto out;
- /* grab sb->s_umount to avoid racing w/ remount() */
+ /*
+ * grab sb->s_umount to avoid racing w/ remount() and other shutdown
+ * paths.
+ */
if (need_lock)
- down_read(&sbi->sb->s_umount);
+ down_write(&sbi->sb->s_umount);
f2fs_stop_gc_thread(sbi);
f2fs_stop_discard_thread(sbi);
@@ -2354,7 +2383,7 @@ int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag,
clear_opt(sbi, DISCARD);
if (need_lock)
- up_read(&sbi->sb->s_umount);
+ up_write(&sbi->sb->s_umount);
f2fs_update_time(sbi, REQ_TIME);
out:
@@ -2861,7 +2890,7 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode))
+ if (!S_ISREG(inode->i_mode))
return -EINVAL;
if (f2fs_readonly(sbi->sb))
@@ -3291,6 +3320,9 @@ int f2fs_pin_file_control(struct inode *inode, bool inc)
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ if (IS_DEVICE_ALIASING(inode))
+ return -EINVAL;
+
if (fi->i_gc_failures >= sbi->gc_pin_file_threshold) {
f2fs_warn(sbi, "%s: Enable GC = ino %lx after %x GC trials",
__func__, inode->i_ino, fi->i_gc_failures);
@@ -3321,6 +3353,9 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
if (f2fs_readonly(sbi->sb))
return -EROFS;
+ if (!pin && IS_DEVICE_ALIASING(inode))
+ return -EOPNOTSUPP;
+
ret = mnt_want_write_file(filp);
if (ret)
return ret;
@@ -3386,6 +3421,12 @@ static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg)
return put_user(pin, (u32 __user *)arg);
}
+static int f2fs_ioc_get_dev_alias_file(struct file *filp, unsigned long arg)
+{
+ return put_user(IS_DEVICE_ALIASING(file_inode(filp)) ? 1 : 0,
+ (u32 __user *)arg);
+}
+
int f2fs_precache_extents(struct inode *inode)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -3787,7 +3828,7 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count,
to_reserved = cluster_size - compr_blocks - reserved;
/* for the case all blocks in cluster were reserved */
- if (to_reserved == 1) {
+ if (reserved && to_reserved == 1) {
dn->ofs_in_node += cluster_size;
goto next;
}
@@ -4485,6 +4526,8 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_decompress_file(filp);
case F2FS_IOC_COMPRESS_FILE:
return f2fs_ioc_compress_file(filp);
+ case F2FS_IOC_GET_DEV_ALIAS_FILE:
+ return f2fs_ioc_get_dev_alias_file(filp, arg);
default:
return -ENOTTY;
}
@@ -4760,7 +4803,8 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter,
else
return 0;
- map.m_may_create = true;
+ if (!IS_DEVICE_ALIASING(inode))
+ map.m_may_create = true;
if (dio) {
map.m_seg_type = f2fs_rw_hint_to_seg_type(sbi,
inode->i_write_hint);
@@ -4816,8 +4860,8 @@ static void f2fs_dio_write_submit_io(const struct iomap_iter *iter,
{
struct inode *inode = iter->inode;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- int seg_type = f2fs_rw_hint_to_seg_type(sbi, inode->i_write_hint);
- enum temp_type temp = f2fs_get_segment_temp(seg_type);
+ enum log_type type = f2fs_rw_hint_to_seg_type(sbi, inode->i_write_hint);
+ enum temp_type temp = f2fs_get_segment_temp(sbi, type);
bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, DATA, temp);
submit_bio(bio);
@@ -5197,6 +5241,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_SET_COMPRESS_OPTION:
case F2FS_IOC_DECOMPRESS_FILE:
case F2FS_IOC_COMPRESS_FILE:
+ case F2FS_IOC_GET_DEV_ALIAS_FILE:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 9322a7200e31..3e1b6d2ff3a7 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -257,6 +257,8 @@ static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type)
switch (sbi->gc_mode) {
case GC_IDLE_CB:
+ case GC_URGENT_LOW:
+ case GC_URGENT_MID:
gc_mode = GC_CB;
break;
case GC_IDLE_GREEDY:
@@ -361,20 +363,15 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct sit_info *sit_i = SIT_I(sbi);
- unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
- unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
unsigned long long mtime = 0;
unsigned int vblocks;
unsigned char age = 0;
unsigned char u;
- unsigned int i;
unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi);
- for (i = 0; i < usable_segs_per_sec; i++)
- mtime += get_seg_entry(sbi, start + i)->mtime;
+ mtime = f2fs_get_section_mtime(sbi, segno);
+ f2fs_bug_on(sbi, mtime == INVALID_MTIME);
vblocks = get_valid_blocks(sbi, segno, true);
-
- mtime = div_u64(mtime, usable_segs_per_sec);
vblocks = div_u64(vblocks, usable_segs_per_sec);
u = BLKS_TO_SEGS(sbi, vblocks * 100);
@@ -519,10 +516,7 @@ static void add_victim_entry(struct f2fs_sb_info *sbi,
struct victim_sel_policy *p, unsigned int segno)
{
struct sit_info *sit_i = SIT_I(sbi);
- unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
- unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
unsigned long long mtime = 0;
- unsigned int i;
if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
if (p->gc_mode == GC_AT &&
@@ -530,9 +524,8 @@ static void add_victim_entry(struct f2fs_sb_info *sbi,
return;
}
- for (i = 0; i < SEGS_PER_SEC(sbi); i++)
- mtime += get_seg_entry(sbi, start + i)->mtime;
- mtime = div_u64(mtime, SEGS_PER_SEC(sbi));
+ mtime = f2fs_get_section_mtime(sbi, segno);
+ f2fs_bug_on(sbi, mtime == INVALID_MTIME);
/* Handle if the system time has changed by the user */
if (mtime < sit_i->min_mtime)
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 2914b678bf8f..5c1eaf55e127 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -35,6 +35,7 @@
#define LIMIT_BOOST_ZONED_GC 25 /* percentage over total user space of boosted gc for zoned devices */
#define DEF_MIGRATION_WINDOW_GRANULARITY_ZONED 3
#define BOOST_GC_MULTIPLE 5
+#define ZONED_PIN_SEC_REQUIRED_COUNT 1
#define DEF_GC_FAILED_PINNED_FILES 2048
#define MAX_GC_FAILED_PINNED_FILES USHRT_MAX
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 1ed86df343a5..282fd320bdb3 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -372,6 +372,19 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
+ if (IS_DEVICE_ALIASING(inode)) {
+ if (!f2fs_sb_has_device_alias(sbi)) {
+ f2fs_warn(sbi, "%s: inode (ino=%lx) has device alias flag, but the feature is off",
+ __func__, inode->i_ino);
+ return false;
+ }
+ if (!f2fs_is_pinned_file(inode)) {
+ f2fs_warn(sbi, "%s: inode (ino=%lx) has device alias flag, but is not pinned",
+ __func__, inode->i_ino);
+ return false;
+ }
+ }
+
return true;
}
@@ -775,8 +788,10 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
!is_inode_flag_set(inode, FI_DIRTY_INODE))
return 0;
- if (!f2fs_is_checkpoint_ready(sbi))
+ if (!f2fs_is_checkpoint_ready(sbi)) {
+ f2fs_mark_inode_dirty_sync(inode, true);
return -ENOSPC;
+ }
/*
* We need to balance fs here to prevent from producing dirty node pages
@@ -823,7 +838,8 @@ void f2fs_evict_inode(struct inode *inode)
f2fs_bug_on(sbi, get_dirty_pages(inode));
f2fs_remove_dirty_inode(inode);
- f2fs_destroy_extent_tree(inode);
+ if (!IS_DEVICE_ALIASING(inode))
+ f2fs_destroy_extent_tree(inode);
if (inode->i_nlink || is_bad_inode(inode))
goto no_delete;
@@ -879,6 +895,9 @@ retry:
goto retry;
}
+ if (IS_DEVICE_ALIASING(inode))
+ f2fs_destroy_extent_tree(inode);
+
if (err) {
f2fs_update_inode_page(inode);
if (dquot_initialize_needed(inode))
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 59b13ff243fa..0b900a7a48e5 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -905,6 +905,16 @@ static int truncate_node(struct dnode_of_data *dn)
if (err)
return err;
+ if (ni.blk_addr != NEW_ADDR &&
+ !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC_ENHANCE)) {
+ f2fs_err_ratelimited(sbi,
+ "nat entry is corrupted, run fsck to fix it, ino:%u, "
+ "nid:%u, blkaddr:%u", ni.ino, ni.nid, ni.blk_addr);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT);
+ return -EFSCORRUPTED;
+ }
+
/* Deallocate node address */
f2fs_invalidate_blocks(sbi, ni.blk_addr);
dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino);
@@ -1056,7 +1066,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
int i;
int idx = depth - 2;
- nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
+ nid[0] = get_nid(dn->inode_page, offset[0], true);
if (!nid[0])
return 0;
@@ -1167,7 +1177,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from)
skip_partial:
while (cont) {
- dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
+ dn.nid = get_nid(page, offset[0], true);
switch (offset[0]) {
case NODE_DIR1_BLOCK:
case NODE_DIR2_BLOCK:
@@ -1199,13 +1209,10 @@ skip_partial:
}
if (err < 0)
goto fail;
- if (offset[1] == 0 &&
- ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
+ if (offset[1] == 0 && get_nid(page, offset[0], true)) {
lock_page(page);
BUG_ON(page->mapping != NODE_MAPPING(sbi));
- f2fs_wait_on_page_writeback(page, NODE, true, true);
- ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
- set_page_dirty(page);
+ set_nid(page, offset[0], 0, true);
unlock_page(page);
}
offset[1] = 0;
@@ -1331,7 +1338,12 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
err = -EFSCORRUPTED;
dec_valid_node_count(sbi, dn->inode, !ofs);
set_sbi_flag(sbi, SBI_NEED_FSCK);
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
+ f2fs_warn_ratelimited(sbi,
+ "f2fs_new_node_page: inconsistent nat entry, "
+ "ino:%u, nid:%u, blkaddr:%u, ver:%u, flag:%u",
+ new_ni.ino, new_ni.nid, new_ni.blk_addr,
+ new_ni.version, new_ni.flag);
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT);
goto fail;
}
#endif
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index e4d81b8705d1..f35be2c48e3c 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -899,13 +899,8 @@ skip:
* and the f2fs is not read only, check and fix zoned block devices'
* write pointer consistency.
*/
- if (f2fs_sb_has_blkzoned(sbi) && !f2fs_readonly(sbi->sb)) {
- int err2 = f2fs_fix_curseg_write_pointer(sbi);
-
- if (!err2)
- err2 = f2fs_check_write_pointer(sbi);
- if (err2)
- err = err2;
+ if (!err) {
+ err = f2fs_check_and_fix_write_pointer(sbi);
ret = err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 1766254279d2..eade36c5ef13 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1290,16 +1290,18 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
wait_list, issued);
return 0;
}
-
- /*
- * Issue discard for conventional zones only if the device
- * supports discard.
- */
- if (!bdev_max_discard_sectors(bdev))
- return -EOPNOTSUPP;
}
#endif
+ /*
+ * stop issuing discard for any of below cases:
+ * 1. device is conventional zone, but it doesn't support discard.
+ * 2. device is regular device, after snapshot it doesn't support
+ * discard.
+ */
+ if (!bdev_max_discard_sectors(bdev))
+ return -EOPNOTSUPP;
+
trace_f2fs_issue_discard(bdev, dc->di.start, dc->di.len);
lstart = dc->di.lstart;
@@ -2711,7 +2713,7 @@ static int get_new_segment(struct f2fs_sb_info *sbi,
if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_PRIOR_CONV || pinning)
segno = 0;
else
- segno = max(first_zoned_segno(sbi), *newseg);
+ segno = max(sbi->first_zoned_segno, *newseg);
hint = GET_SEC_FROM_SEG(sbi, segno);
}
#endif
@@ -2723,7 +2725,7 @@ find_other_zone:
if (secno >= MAIN_SECS(sbi) && f2fs_sb_has_blkzoned(sbi)) {
/* Write only to sequential zones */
if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_ONLY_SEQ) {
- hint = GET_SEC_FROM_SEG(sbi, first_zoned_segno(sbi));
+ hint = GET_SEC_FROM_SEG(sbi, sbi->first_zoned_segno);
secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
} else
secno = find_first_zero_bit(free_i->free_secmap,
@@ -2926,7 +2928,8 @@ static int change_curseg(struct f2fs_sb_info *sbi, int type)
struct f2fs_summary_block *sum_node;
struct page *sum_page;
- write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno));
+ if (curseg->inited)
+ write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno));
__set_test_and_inuse(sbi, new_segno);
@@ -3237,7 +3240,8 @@ retry:
if (f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) {
f2fs_down_write(&sbi->gc_lock);
- err = f2fs_gc_range(sbi, 0, GET_SEGNO(sbi, FDEV(0).end_blk), true, 1);
+ err = f2fs_gc_range(sbi, 0, GET_SEGNO(sbi, FDEV(0).end_blk),
+ true, ZONED_PIN_SEC_REQUIRED_COUNT);
f2fs_up_write(&sbi->gc_lock);
gc_required = false;
@@ -3581,18 +3585,35 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
}
}
-int f2fs_get_segment_temp(int seg_type)
+enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi,
+ enum log_type type)
{
- if (IS_HOT(seg_type))
- return HOT;
- else if (IS_WARM(seg_type))
- return WARM;
- return COLD;
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ enum temp_type temp = COLD;
+
+ switch (curseg->seg_type) {
+ case CURSEG_HOT_NODE:
+ case CURSEG_HOT_DATA:
+ temp = HOT;
+ break;
+ case CURSEG_WARM_NODE:
+ case CURSEG_WARM_DATA:
+ temp = WARM;
+ break;
+ case CURSEG_COLD_NODE:
+ case CURSEG_COLD_DATA:
+ temp = COLD;
+ break;
+ default:
+ f2fs_bug_on(sbi, 1);
+ }
+
+ return temp;
}
static int __get_segment_type(struct f2fs_io_info *fio)
{
- int type = 0;
+ enum log_type type = CURSEG_HOT_DATA;
switch (F2FS_OPTION(fio->sbi).active_logs) {
case 2:
@@ -3608,7 +3629,7 @@ static int __get_segment_type(struct f2fs_io_info *fio)
f2fs_bug_on(fio->sbi, true);
}
- fio->temp = f2fs_get_segment_temp(type);
+ fio->temp = f2fs_get_segment_temp(fio->sbi, type);
return type;
}
@@ -3793,10 +3814,35 @@ void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
}
}
+static int log_type_to_seg_type(enum log_type type)
+{
+ int seg_type = CURSEG_COLD_DATA;
+
+ switch (type) {
+ case CURSEG_HOT_DATA:
+ case CURSEG_WARM_DATA:
+ case CURSEG_COLD_DATA:
+ case CURSEG_HOT_NODE:
+ case CURSEG_WARM_NODE:
+ case CURSEG_COLD_NODE:
+ seg_type = (int)type;
+ break;
+ case CURSEG_COLD_DATA_PINNED:
+ case CURSEG_ALL_DATA_ATGC:
+ seg_type = CURSEG_COLD_DATA;
+ break;
+ default:
+ break;
+ }
+ return seg_type;
+}
+
static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
{
- int type = __get_segment_type(fio);
- bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA);
+ enum log_type type = __get_segment_type(fio);
+ int seg_type = log_type_to_seg_type(type);
+ bool keep_order = (f2fs_lfs_mode(fio->sbi) &&
+ seg_type == CURSEG_COLD_DATA);
if (keep_order)
f2fs_down_read(&fio->sbi->io_order_lock);
@@ -3977,8 +4023,8 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
}
}
- f2fs_bug_on(sbi, !IS_DATASEG(type));
curseg = CURSEG_I(sbi, type);
+ f2fs_bug_on(sbi, !IS_DATASEG(curseg->seg_type));
mutex_lock(&curseg->curseg_mutex);
down_write(&sit_i->sentry_lock);
@@ -4778,12 +4824,7 @@ static int build_curseg(struct f2fs_sb_info *sbi)
sizeof(struct f2fs_journal), GFP_KERNEL);
if (!array[i].journal)
return -ENOMEM;
- if (i < NR_PERSISTENT_LOG)
- array[i].seg_type = CURSEG_HOT_DATA + i;
- else if (i == CURSEG_COLD_DATA_PINNED)
- array[i].seg_type = CURSEG_COLD_DATA;
- else if (i == CURSEG_ALL_DATA_ATGC)
- array[i].seg_type = CURSEG_COLD_DATA;
+ array[i].seg_type = log_type_to_seg_type(i);
reset_curseg_fields(&array[i]);
}
return restore_curseg_summaries(sbi);
@@ -5207,7 +5248,7 @@ static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx,
return 0;
}
-static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
+static int do_fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *cs = CURSEG_I(sbi, type);
struct f2fs_dev_info *zbd;
@@ -5312,12 +5353,12 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
return 0;
}
-int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi)
+static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi)
{
int i, ret;
for (i = 0; i < NR_PERSISTENT_LOG; i++) {
- ret = fix_curseg_write_pointer(sbi, i);
+ ret = do_fix_curseg_write_pointer(sbi, i);
if (ret)
return ret;
}
@@ -5340,7 +5381,7 @@ static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx,
return check_zone_write_pointer(args->sbi, args->fdev, zone);
}
-int f2fs_check_write_pointer(struct f2fs_sb_info *sbi)
+static int check_write_pointer(struct f2fs_sb_info *sbi)
{
int i, ret;
struct check_zone_write_pointer_args args;
@@ -5360,6 +5401,20 @@ int f2fs_check_write_pointer(struct f2fs_sb_info *sbi)
return 0;
}
+int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi)
+{
+ int ret;
+
+ if (!f2fs_sb_has_blkzoned(sbi) || f2fs_readonly(sbi->sb))
+ return 0;
+
+ f2fs_notice(sbi, "Checking entire write pointers");
+ ret = fix_curseg_write_pointer(sbi);
+ if (!ret)
+ ret = check_write_pointer(sbi);
+ return ret;
+}
+
/*
* Return the number of usable blocks in a segment. The number of blocks
* returned is always equal to the number of blocks in a segment for
@@ -5396,12 +5451,7 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg(
return BLKS_PER_SEG(sbi);
}
#else
-int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi)
-{
- return 0;
-}
-
-int f2fs_check_write_pointer(struct f2fs_sb_info *sbi)
+int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi)
{
return 0;
}
@@ -5430,6 +5480,35 @@ unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi)
return SEGS_PER_SEC(sbi);
}
+unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+ unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi);
+ unsigned int secno = 0, start = 0;
+ unsigned int total_valid_blocks = 0;
+ unsigned long long mtime = 0;
+ unsigned int i = 0;
+
+ secno = GET_SEC_FROM_SEG(sbi, segno);
+ start = GET_SEG_FROM_SEC(sbi, secno);
+
+ if (!__is_large_section(sbi))
+ return get_seg_entry(sbi, start + i)->mtime;
+
+ for (i = 0; i < usable_segs_per_sec; i++) {
+ /* for large section, only check the mtime of valid segments */
+ struct seg_entry *se = get_seg_entry(sbi, start+i);
+
+ mtime += se->mtime * se->valid_blocks;
+ total_valid_blocks += se->valid_blocks;
+ }
+
+ if (total_valid_blocks == 0)
+ return INVALID_MTIME;
+
+ return div_u64(mtime, total_valid_blocks);
+}
+
/*
* Update min, max modified time for cost-benefit GC algorithm
*/
@@ -5443,13 +5522,9 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi)
sit_i->min_mtime = ULLONG_MAX;
for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) {
- unsigned int i;
unsigned long long mtime = 0;
- for (i = 0; i < SEGS_PER_SEC(sbi); i++)
- mtime += get_seg_entry(sbi, segno + i)->mtime;
-
- mtime = div_u64(mtime, SEGS_PER_SEC(sbi));
+ mtime = f2fs_get_section_mtime(sbi, segno);
if (sit_i->min_mtime > mtime)
sit_i->min_mtime = mtime;
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 71adb4a43bec..943be4f1d6d2 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -18,6 +18,8 @@
#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */
#define F2FS_MIN_META_SEGMENTS 8 /* SB + 2 (CP + SIT + NAT) + SSA */
+#define INVALID_MTIME ULLONG_MAX /* no valid blocks in a segment/section */
+
/* L: Logical segment # in volume, R: Relative segment # in main area */
#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno)
#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno)
@@ -32,10 +34,6 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG);
}
-#define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA)
-#define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA)
-#define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA)
-
#define IS_CURSEG(sbi, seg) \
(((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \
@@ -524,8 +522,7 @@ static inline unsigned int free_segments(struct f2fs_sb_info *sbi)
static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi)
{
- return SM_I(sbi)->reserved_segments +
- SM_I(sbi)->additional_reserved_segments;
+ return SM_I(sbi)->reserved_segments;
}
static inline unsigned int free_sections(struct f2fs_sb_info *sbi)
@@ -559,18 +556,21 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
}
static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
- unsigned int node_blocks, unsigned int dent_blocks)
+ unsigned int node_blocks, unsigned int data_blocks,
+ unsigned int dent_blocks)
{
- unsigned segno, left_blocks;
+ unsigned int segno, left_blocks, blocks;
int i;
- /* check current node sections in the worst case. */
- for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) {
+ /* check current data/node sections in the worst case. */
+ for (i = CURSEG_HOT_DATA; i < NR_PERSISTENT_LOG; i++) {
segno = CURSEG_I(sbi, i)->segno;
left_blocks = CAP_BLKS_PER_SEC(sbi) -
get_ckpt_valid_blocks(sbi, segno, true);
- if (node_blocks > left_blocks)
+
+ blocks = i <= CURSEG_COLD_DATA ? data_blocks : node_blocks;
+ if (blocks > left_blocks)
return false;
}
@@ -584,8 +584,9 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
}
/*
- * calculate needed sections for dirty node/dentry
- * and call has_curseg_enough_space
+ * calculate needed sections for dirty node/dentry and call
+ * has_curseg_enough_space. Please note that it needs to account for
+ * dirty data as well in lfs mode when checkpoint is disabled.
*/
static inline void __get_secs_required(struct f2fs_sb_info *sbi,
unsigned int *lower_p, unsigned int *upper_p, bool *curseg_p)
@@ -594,19 +595,30 @@ static inline void __get_secs_required(struct f2fs_sb_info *sbi,
get_pages(sbi, F2FS_DIRTY_DENTS) +
get_pages(sbi, F2FS_DIRTY_IMETA);
unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS);
+ unsigned int total_data_blocks = 0;
unsigned int node_secs = total_node_blocks / CAP_BLKS_PER_SEC(sbi);
unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi);
+ unsigned int data_secs = 0;
unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi);
unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi);
+ unsigned int data_blocks = 0;
+
+ if (f2fs_lfs_mode(sbi) &&
+ unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+ total_data_blocks = get_pages(sbi, F2FS_DIRTY_DATA);
+ data_secs = total_data_blocks / CAP_BLKS_PER_SEC(sbi);
+ data_blocks = total_data_blocks % CAP_BLKS_PER_SEC(sbi);
+ }
if (lower_p)
- *lower_p = node_secs + dent_secs;
+ *lower_p = node_secs + dent_secs + data_secs;
if (upper_p)
*upper_p = node_secs + dent_secs +
- (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0);
+ (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0) +
+ (data_blocks ? 1 : 0);
if (curseg_p)
*curseg_p = has_curseg_enough_space(sbi,
- node_blocks, dent_blocks);
+ node_blocks, data_blocks, dent_blocks);
}
static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
@@ -637,12 +649,30 @@ static inline bool has_enough_free_secs(struct f2fs_sb_info *sbi,
return !has_not_enough_free_secs(sbi, freed, needed);
}
+static inline bool has_enough_free_blks(struct f2fs_sb_info *sbi)
+{
+ unsigned int total_free_blocks = 0;
+ unsigned int avail_user_block_count;
+
+ spin_lock(&sbi->stat_lock);
+
+ avail_user_block_count = get_available_block_count(sbi, NULL, true);
+ total_free_blocks = avail_user_block_count - (unsigned int)valid_user_blocks(sbi);
+
+ spin_unlock(&sbi->stat_lock);
+
+ return total_free_blocks > 0;
+}
+
static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi)
{
if (likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
return true;
if (likely(has_enough_free_secs(sbi, 0, 0)))
return true;
+ if (!f2fs_lfs_mode(sbi) &&
+ likely(has_enough_free_blks(sbi)))
+ return true;
return false;
}
@@ -957,13 +987,3 @@ wake_up:
dcc->discard_wake = true;
wake_up_interruptible_all(&dcc->discard_wait_queue);
}
-
-static inline unsigned int first_zoned_segno(struct f2fs_sb_info *sbi)
-{
- int devi;
-
- for (devi = 0; devi < sbi->s_ndevs; devi++)
- if (bdev_is_zoned(FDEV(devi).bdev))
- return GET_SEGNO(sbi, FDEV(devi).start_blk);
- return 0;
-}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 87ab5696bd48..fc7d463dee15 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -150,6 +150,8 @@ enum {
Opt_mode,
Opt_fault_injection,
Opt_fault_type,
+ Opt_lazytime,
+ Opt_nolazytime,
Opt_quota,
Opt_noquota,
Opt_usrquota,
@@ -226,6 +228,8 @@ static match_table_t f2fs_tokens = {
{Opt_mode, "mode=%s"},
{Opt_fault_injection, "fault_injection=%u"},
{Opt_fault_type, "fault_type=%u"},
+ {Opt_lazytime, "lazytime"},
+ {Opt_nolazytime, "nolazytime"},
{Opt_quota, "quota"},
{Opt_noquota, "noquota"},
{Opt_usrquota, "usrquota"},
@@ -834,6 +838,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
set_opt(sbi, READ_EXTENT_CACHE);
break;
case Opt_noextent_cache:
+ if (F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_DEVICE_ALIAS)) {
+ f2fs_err(sbi, "device aliasing requires extent cache");
+ return -EINVAL;
+ }
clear_opt(sbi, READ_EXTENT_CACHE);
break;
case Opt_noinline_data:
@@ -918,6 +926,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
f2fs_info(sbi, "fault_type options not supported");
break;
#endif
+ case Opt_lazytime:
+ sb->s_flags |= SB_LAZYTIME;
+ break;
+ case Opt_nolazytime:
+ sb->s_flags &= ~SB_LAZYTIME;
+ break;
#ifdef CONFIG_QUOTA
case Opt_quota:
case Opt_usrquota:
@@ -1158,7 +1172,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
break;
}
- strcpy(ext[ext_cnt], name);
+ ret = strscpy(ext[ext_cnt], name);
+ if (ret < 0) {
+ kfree(name);
+ return ret;
+ }
F2FS_OPTION(sbi).compress_ext_cnt++;
kfree(name);
break;
@@ -1187,7 +1205,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
break;
}
- strcpy(noext[noext_cnt], name);
+ ret = strscpy(noext[noext_cnt], name);
+ if (ret < 0) {
+ kfree(name);
+ return ret;
+ }
F2FS_OPTION(sbi).nocompress_ext_cnt++;
kfree(name);
break;
@@ -1738,6 +1760,18 @@ static int f2fs_freeze(struct super_block *sb)
static int f2fs_unfreeze(struct super_block *sb)
{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+ /*
+ * Creating a snapshot on a mounted lvm device updates its
+ * discard_max_bytes to zero, so let's drop all remaining
+ * discards.
+ * We don't need to disable real-time discard because discard_max_bytes
+ * will recover after removal of snapshot.
+ */
+ if (test_opt(sbi, DISCARD) && !f2fs_hw_support_discard(sbi))
+ f2fs_issue_discard_timeout(sbi);
+
clear_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING);
return 0;
}
@@ -2474,6 +2508,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
}
}
+ adjust_unusable_cap_perc(sbi);
if (enable_checkpoint == !!test_opt(sbi, DISABLE_CHECKPOINT)) {
if (test_opt(sbi, DISABLE_CHECKPOINT)) {
err = f2fs_disable_checkpoint(sbi);
@@ -2518,7 +2553,6 @@ skip:
(test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0);
limit_reserve_root(sbi);
- adjust_unusable_cap_perc(sbi);
*flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
return 0;
restore_checkpoint:
@@ -3322,7 +3356,7 @@ loff_t max_file_blocks(struct inode *inode)
* fit within U32_MAX + 1 data units.
*/
- result = min(result, F2FS_BYTES_TO_BLK(((loff_t)U32_MAX + 1) * 4096));
+ result = umin(result, F2FS_BYTES_TO_BLK(((loff_t)U32_MAX + 1) * 4096));
return result;
}
@@ -4155,8 +4189,7 @@ static bool system_going_down(void)
|| system_state == SYSTEM_RESTART;
}
-void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason,
- bool irq_context)
+void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason)
{
struct super_block *sb = sbi->sb;
bool shutdown = reason == STOP_CP_REASON_SHUTDOWN;
@@ -4168,10 +4201,12 @@ void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason,
if (!f2fs_hw_is_readonly(sbi)) {
save_stop_reason(sbi, reason);
- if (irq_context && !shutdown)
- schedule_work(&sbi->s_error_work);
- else
- f2fs_record_stop_reason(sbi);
+ /*
+ * always create an asynchronous task to record stop_reason
+ * in order to avoid potential deadlock when running into
+ * f2fs_record_stop_reason() synchronously.
+ */
+ schedule_work(&sbi->s_error_work);
}
/*
@@ -4217,6 +4252,16 @@ static void f2fs_record_error_work(struct work_struct *work)
f2fs_record_stop_reason(sbi);
}
+static inline unsigned int get_first_zoned_segno(struct f2fs_sb_info *sbi)
+{
+ int devi;
+
+ for (devi = 0; devi < sbi->s_ndevs; devi++)
+ if (bdev_is_zoned(FDEV(devi).bdev))
+ return GET_SEGNO(sbi, FDEV(devi).start_blk);
+ return 0;
+}
+
static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
@@ -4617,6 +4662,9 @@ try_onemore:
/* For write statistics */
sbi->sectors_written_start = f2fs_get_sectors_written(sbi);
+ /* get segno of first zoned block device */
+ sbi->first_zoned_segno = get_first_zoned_segno(sbi);
+
/* Read accumulated write IO statistics if exists */
seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
if (__exist_node_summaries(sbi))
@@ -4738,26 +4786,23 @@ try_onemore:
reset_checkpoint:
/*
* If the f2fs is not readonly and fsync data recovery succeeds,
- * check zoned block devices' write pointer consistency.
+ * write pointer consistency of cursegs and other zones is already
+ * checked and fixed during recovery. However, if recovery fails,
+ * write pointers are left untouched, and retry-mount should check
+ * them here.
*/
- if (f2fs_sb_has_blkzoned(sbi) && !f2fs_readonly(sb)) {
- int err2;
-
- f2fs_notice(sbi, "Checking entire write pointers");
- err2 = f2fs_check_write_pointer(sbi);
- if (err2)
- err = err2;
- }
+ if (skip_recovery)
+ err = f2fs_check_and_fix_write_pointer(sbi);
if (err)
goto free_meta;
+ /* f2fs_recover_fsync_data() cleared this already */
+ clear_sbi_flag(sbi, SBI_POR_DOING);
+
err = f2fs_init_inmem_curseg(sbi);
if (err)
goto sync_free_meta;
- /* f2fs_recover_fsync_data() cleared this already */
- clear_sbi_flag(sbi, SBI_POR_DOING);
-
if (test_opt(sbi, DISABLE_CHECKPOINT)) {
err = f2fs_disable_checkpoint(sbi);
if (err)
@@ -4991,9 +5036,6 @@ static int __init init_f2fs_fs(void)
err = f2fs_init_shrinker();
if (err)
goto free_sysfs;
- err = register_filesystem(&f2fs_fs_type);
- if (err)
- goto free_shrinker;
f2fs_create_root_stats();
err = f2fs_init_post_read_processing();
if (err)
@@ -5016,7 +5058,12 @@ static int __init init_f2fs_fs(void)
err = f2fs_create_casefold_cache();
if (err)
goto free_compress_cache;
+ err = register_filesystem(&f2fs_fs_type);
+ if (err)
+ goto free_casefold_cache;
return 0;
+free_casefold_cache:
+ f2fs_destroy_casefold_cache();
free_compress_cache:
f2fs_destroy_compress_cache();
free_compress_mempool:
@@ -5031,8 +5078,6 @@ free_post_read:
f2fs_destroy_post_read_processing();
free_root_stats:
f2fs_destroy_root_stats();
- unregister_filesystem(&f2fs_fs_type);
-free_shrinker:
f2fs_exit_shrinker();
free_sysfs:
f2fs_exit_sysfs();
@@ -5056,6 +5101,7 @@ fail:
static void __exit exit_f2fs_fs(void)
{
+ unregister_filesystem(&f2fs_fs_type);
f2fs_destroy_casefold_cache();
f2fs_destroy_compress_cache();
f2fs_destroy_compress_mempool();
@@ -5064,7 +5110,6 @@ static void __exit exit_f2fs_fs(void)
f2fs_destroy_iostat_processing();
f2fs_destroy_post_read_processing();
f2fs_destroy_root_stats();
- unregister_filesystem(&f2fs_fs_type);
f2fs_exit_shrinker();
f2fs_exit_sysfs();
f2fs_destroy_garbage_collection_cache();
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index c56e8c873935..6b99dc49f776 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -501,9 +501,7 @@ out:
if (a->struct_type == RESERVED_BLOCKS) {
spin_lock(&sbi->stat_lock);
if (t > (unsigned long)(sbi->user_block_count -
- F2FS_OPTION(sbi).root_reserved_blocks -
- SEGS_TO_BLKS(sbi,
- SM_I(sbi)->additional_reserved_segments))) {
+ F2FS_OPTION(sbi).root_reserved_blocks)) {
spin_unlock(&sbi->stat_lock);
return -EINVAL;
}
@@ -789,6 +787,13 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "max_read_extent_count")) {
+ if (t > UINT_MAX)
+ return -EINVAL;
+ *ui = (unsigned int)t;
+ return count;
+ }
+
if (!strcmp(a->attr.name, "ipu_policy")) {
if (t >= BIT(F2FS_IPU_MAX))
return -EINVAL;
@@ -1054,6 +1059,8 @@ F2FS_SBI_GENERAL_RW_ATTR(revoked_atomic_block);
F2FS_SBI_GENERAL_RW_ATTR(hot_data_age_threshold);
F2FS_SBI_GENERAL_RW_ATTR(warm_data_age_threshold);
F2FS_SBI_GENERAL_RW_ATTR(last_age_weight);
+/* read extent cache */
+F2FS_SBI_GENERAL_RW_ATTR(max_read_extent_count);
#ifdef CONFIG_BLK_DEV_ZONED
F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec);
F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy);
@@ -1244,6 +1251,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(hot_data_age_threshold),
ATTR_LIST(warm_data_age_threshold),
ATTR_LIST(last_age_weight),
+ ATTR_LIST(max_read_extent_count),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
@@ -1313,6 +1321,7 @@ F2FS_SB_FEATURE_RO_ATTR(sb_checksum, SB_CHKSUM);
F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD);
F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION);
F2FS_SB_FEATURE_RO_ATTR(readonly, RO);
+F2FS_SB_FEATURE_RO_ATTR(device_alias, DEVICE_ALIAS);
static struct attribute *f2fs_sb_feat_attrs[] = {
ATTR_LIST(sb_encryption),
@@ -1329,6 +1338,7 @@ static struct attribute *f2fs_sb_feat_attrs[] = {
ATTR_LIST(sb_casefold),
ATTR_LIST(sb_compression),
ATTR_LIST(sb_readonly),
+ ATTR_LIST(sb_device_alias),
NULL,
};
ATTRIBUTE_GROUPS(f2fs_sb_feat);
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index b0b821edfd97..c24f8bc01045 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -24,10 +24,11 @@
#define NEW_ADDR ((block_t)-1) /* used as block_t addresses */
#define COMPRESS_ADDR ((block_t)-2) /* used as compressed data flag */
-#define F2FS_BYTES_TO_BLK(bytes) ((bytes) >> F2FS_BLKSIZE_BITS)
-#define F2FS_BLK_TO_BYTES(blk) ((blk) << F2FS_BLKSIZE_BITS)
+#define F2FS_BLKSIZE_MASK (F2FS_BLKSIZE - 1)
+#define F2FS_BYTES_TO_BLK(bytes) ((unsigned long long)(bytes) >> F2FS_BLKSIZE_BITS)
+#define F2FS_BLK_TO_BYTES(blk) ((unsigned long long)(blk) << F2FS_BLKSIZE_BITS)
#define F2FS_BLK_END_BYTES(blk) (F2FS_BLK_TO_BYTES(blk + 1) - 1)
-#define F2FS_BLK_ALIGN(x) (F2FS_BYTES_TO_BLK((x) + F2FS_BLKSIZE - 1))
+#define F2FS_BLK_ALIGN(x) (F2FS_BYTES_TO_BLK((x) + F2FS_BLKSIZE - 1))
/* 0, 1(node nid), 2(meta nid) are reserved node id */
#define F2FS_RESERVED_NODE_NUM 3
diff --git a/include/uapi/linux/f2fs.h b/include/uapi/linux/f2fs.h
index 955d440be104..f7aaf8d23e20 100644
--- a/include/uapi/linux/f2fs.h
+++ b/include/uapi/linux/f2fs.h
@@ -43,6 +43,7 @@
#define F2FS_IOC_DECOMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 23)
#define F2FS_IOC_COMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 24)
#define F2FS_IOC_START_ATOMIC_REPLACE _IO(F2FS_IOCTL_MAGIC, 25)
+#define F2FS_IOC_GET_DEV_ALIAS_FILE _IOR(F2FS_IOCTL_MAGIC, 26, __u32)
/*
* should be same as XFS_IOC_GOINGDOWN.