From 13dd4e04625f600e5affb1b3f0b6c35268ab839b Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Wed, 22 Mar 2023 11:11:09 +0000 Subject: fsdax: unshare: zero destination if srcmap is HOLE or UNWRITTEN unshare copies data from source to destination. But if the source is HOLE or UNWRITTEN extents, we should zero the destination, otherwise the HOLE or UNWRITTEN part will be user-visible old data of the new allocated extent. Found by running generic/649 while mounting with -o dax=always on pmem. Link: https://lkml.kernel.org/r/1679483469-2-1-git-send-email-ruansy.fnst@fujitsu.com Fixes: d984648e428b ("fsdax,xfs: port unshare to fsdax") Signed-off-by: Shiyang Ruan Cc: Dan Williams Cc: Darrick J. Wong Cc: Jan Kara Cc: Matthew Wilcox (Oracle) Cc: Alistair Popple Cc: Jason Gunthorpe Cc: John Hubbard Cc: Signed-off-by: Andrew Morton --- fs/dax.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 3e457a16c7d1..e48d68902a8c 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1258,15 +1258,20 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) /* don't bother with blocks that are not shared to start with */ if (!(iomap->flags & IOMAP_F_SHARED)) return length; - /* don't bother with holes or unwritten extents */ - if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) - return length; id = dax_read_lock(); ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL); if (ret < 0) goto out_unlock; + /* zero the distance if srcmap is HOLE or UNWRITTEN */ + if (srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN) { + memset(daddr, 0, length); + dax_flush(iomap->dax_dev, daddr, length); + ret = length; + goto out_unlock; + } + ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL); if (ret < 0) goto out_unlock; -- cgit v1.2.3 From e900ba10d15041a6236cc75778cc6e06c3590a58 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Wed, 22 Mar 2023 07:25:58 +0000 Subject: fsdax: dedupe should compare the min of two iters' length In an dedupe comparison iter loop, the length of iomap_iter decreases because it implies the remaining length after each iteration. The dedupe command will fail with -EIO if the range is larger than one page size and not aligned to the page size. Also report warning in dmesg: [ 4338.498374] ------------[ cut here ]------------ [ 4338.498689] WARNING: CPU: 3 PID: 1415645 at fs/iomap/iter.c:16 ... The compare function should use the min length of the current iters, not the total length. Link: https://lkml.kernel.org/r/1679469958-2-1-git-send-email-ruansy.fnst@fujitsu.com Fixes: 0e79e3736d54 ("fsdax: dedupe: iter two files at the same time") Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Dan Williams Cc: Jan Kara Cc: Matthew Wilcox (Oracle) Cc: Signed-off-by: Andrew Morton --- fs/dax.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index e48d68902a8c..5d2e9b10030e 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -2027,8 +2027,8 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, while ((ret = iomap_iter(&src_iter, ops)) > 0 && (ret = iomap_iter(&dst_iter, ops)) > 0) { - compared = dax_range_compare_iter(&src_iter, &dst_iter, len, - same); + compared = dax_range_compare_iter(&src_iter, &dst_iter, + min(src_iter.len, dst_iter.len), same); if (compared < 0) return ret; src_iter.processed = dst_iter.processed = compared; -- cgit v1.2.3 From f76b3a32879de215ced3f8c754c4077b0c2f79e3 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 24 Mar 2023 10:28:00 +0000 Subject: fsdax: force clear dirty mark if CoW XFS allows CoW on non-shared extents to combat fragmentation[1]. The old non-shared extent could be mwrited before, its dax entry is marked dirty. This results in a WARNing: [ 28.512349] ------------[ cut here ]------------ [ 28.512622] WARNING: CPU: 2 PID: 5255 at fs/dax.c:390 dax_insert_entry+0x342/0x390 [ 28.513050] Modules linked in: rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs lockd grace fscache netfs nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables [ 28.515462] CPU: 2 PID: 5255 Comm: fsstress Kdump: loaded Not tainted 6.3.0-rc1-00001-g85e1481e19c1-dirty #117 [ 28.515902] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Arch Linux 1.16.1-1-1 04/01/2014 [ 28.516307] RIP: 0010:dax_insert_entry+0x342/0x390 [ 28.516536] Code: 30 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 48 8b 45 20 48 83 c0 01 e9 e2 fe ff ff 48 8b 45 20 48 83 c0 01 e9 cd fe ff ff <0f> 0b e9 53 ff ff ff 48 8b 7c 24 08 31 f6 e8 1b 61 a1 00 eb 8c 48 [ 28.517417] RSP: 0000:ffffc9000845fb18 EFLAGS: 00010086 [ 28.517721] RAX: 0000000000000053 RBX: 0000000000000155 RCX: 000000000018824b [ 28.518113] RDX: 0000000000000000 RSI: ffffffff827525a6 RDI: 00000000ffffffff [ 28.518515] RBP: ffffea00062092c0 R08: 0000000000000000 R09: ffffc9000845f9c8 [ 28.518905] R10: 0000000000000003 R11: ffffffff82ddb7e8 R12: 0000000000000155 [ 28.519301] R13: 0000000000000000 R14: 000000000018824b R15: ffff88810cfa76b8 [ 28.519703] FS: 00007f14a0c94740(0000) GS:ffff88817bd00000(0000) knlGS:0000000000000000 [ 28.520148] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 28.520472] CR2: 00007f14a0c8d000 CR3: 000000010321c004 CR4: 0000000000770ee0 [ 28.520863] PKRU: 55555554 [ 28.521043] Call Trace: [ 28.521219] [ 28.521368] dax_fault_iter+0x196/0x390 [ 28.521595] dax_iomap_pte_fault+0x19b/0x3d0 [ 28.521852] __xfs_filemap_fault+0x234/0x2b0 [ 28.522116] __do_fault+0x30/0x130 [ 28.522334] do_fault+0x193/0x340 [ 28.522586] __handle_mm_fault+0x2d3/0x690 [ 28.522975] handle_mm_fault+0xe6/0x2c0 [ 28.523259] do_user_addr_fault+0x1bc/0x6f0 [ 28.523521] exc_page_fault+0x60/0x140 [ 28.523763] asm_exc_page_fault+0x22/0x30 [ 28.524001] RIP: 0033:0x7f14a0b589ca [ 28.524225] Code: c5 fe 7f 07 c5 fe 7f 47 20 c5 fe 7f 47 40 c5 fe 7f 47 60 c5 f8 77 c3 66 0f 1f 84 00 00 00 00 00 40 0f b6 c6 48 89 d1 48 89 fa aa 48 89 d0 c5 f8 77 c3 66 66 2e 0f 1f 84 00 00 00 00 00 66 90 [ 28.525198] RSP: 002b:00007fff1dea1c98 EFLAGS: 00010202 [ 28.525505] RAX: 000000000000001e RBX: 000000000014a000 RCX: 0000000000006046 [ 28.525895] RDX: 00007f14a0c82000 RSI: 000000000000001e RDI: 00007f14a0c8d000 [ 28.526290] RBP: 000000000000006f R08: 0000000000000004 R09: 000000000014a000 [ 28.526681] R10: 0000000000000008 R11: 0000000000000246 R12: 028f5c28f5c28f5c [ 28.527067] R13: 8f5c28f5c28f5c29 R14: 0000000000011046 R15: 00007f14a0c946c0 [ 28.527449] [ 28.527600] ---[ end trace 0000000000000000 ]--- To be able to delete this entry, clear its dirty mark before invalidate_inode_pages2_range(). [1] https://lore.kernel.org/linux-xfs/20230321151339.GA11376@frogsfrogsfrogs/ Link: https://lkml.kernel.org/r/1679653680-2-1-git-send-email-ruansy.fnst@fujitsu.com Fixes: f80e1668888f3 ("fsdax: invalidate pages when CoW") Signed-off-by: Shiyang Ruan Cc: Dan Williams Cc: Darrick J. Wong Cc: Jan Kara Cc: Matthew Wilcox (Oracle) Cc: Signed-off-by: Andrew Morton --- fs/dax.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 5d2e9b10030e..2ababb89918d 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -781,6 +781,33 @@ out: return ret; } +static int __dax_clear_dirty_range(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + XA_STATE(xas, &mapping->i_pages, start); + unsigned int scanned = 0; + void *entry; + + xas_lock_irq(&xas); + xas_for_each(&xas, entry, end) { + entry = get_unlocked_entry(&xas, 0); + xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); + xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); + put_unlocked_entry(&xas, entry, WAKE_NEXT); + + if (++scanned % XA_CHECK_SCHED) + continue; + + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); + } + xas_unlock_irq(&xas); + + return 0; +} + /* * Delete DAX entry at @index from @mapping. Wait for it * to be unlocked before deleting it. @@ -1440,6 +1467,16 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, * written by write(2) is visible in mmap. */ if (iomap->flags & IOMAP_F_NEW || cow) { + /* + * Filesystem allows CoW on non-shared extents. The src extents + * may have been mmapped with dirty mark before. To be able to + * invalidate its dax entries, we need to clear the dirty mark + * in advance. + */ + if (cow) + __dax_clear_dirty_range(iomi->inode->i_mapping, + pos >> PAGE_SHIFT, + (end - 1) >> PAGE_SHIFT); invalidate_inode_pages2_range(iomi->inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); -- cgit v1.2.3 From 6be49d100c22ffea3287a4b19d7639d259888e33 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Tue, 28 Mar 2023 02:53:18 +0900 Subject: nilfs2: fix potential UAF of struct nilfs_sc_info in nilfs_segctor_thread() The finalization of nilfs_segctor_thread() can race with nilfs_segctor_kill_thread() which terminates that thread, potentially causing a use-after-free BUG as KASAN detected. At the end of nilfs_segctor_thread(), it assigns NULL to "sc_task" member of "struct nilfs_sc_info" to indicate the thread has finished, and then notifies nilfs_segctor_kill_thread() of this using waitqueue "sc_wait_task" on the struct nilfs_sc_info. However, here, immediately after the NULL assignment to "sc_task", it is possible that nilfs_segctor_kill_thread() will detect it and return to continue the deallocation, freeing the nilfs_sc_info structure before the thread does the notification. This fixes the issue by protecting the NULL assignment to "sc_task" and its notification, with spinlock "sc_state_lock" of the struct nilfs_sc_info. Since nilfs_segctor_kill_thread() does a final check to see if "sc_task" is NULL with "sc_state_lock" locked, this can eliminate the race. Link: https://lkml.kernel.org/r/20230327175318.8060-1-konishi.ryusuke@gmail.com Reported-by: syzbot+b08ebcc22f8f3e6be43a@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/00000000000000660d05f7dfa877@google.com Signed-off-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 19446a8243d7..6ad41390fa74 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2609,11 +2609,10 @@ static int nilfs_segctor_thread(void *arg) goto loop; end_thread: - spin_unlock(&sci->sc_state_lock); - /* end sync. */ sci->sc_task = NULL; wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */ + spin_unlock(&sci->sc_state_lock); return 0; } -- cgit v1.2.3 From 7397031622e05ca206e2d674ec199d6bb66fc9ba Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 27 Mar 2023 00:21:46 +0900 Subject: nilfs2: initialize "struct nilfs_binfo_dat"->bi_pad field nilfs_btree_assign_p() and nilfs_direct_assign_p() are not initializing "struct nilfs_binfo_dat"->bi_pad field, causing uninit-value reports when being passed to CRC function. Link: https://lkml.kernel.org/r/20230326152146.15872-1-konishi.ryusuke@gmail.com Reported-by: syzbot Link: https://syzkaller.appspot.com/bug?extid=048585f3f4227bb2b49b Reported-by: Dipanjan Das Link: https://lkml.kernel.org/r/CANX2M5bVbzRi6zH3PTcNE_31TzerstOXUa9Bay4E6y6dX23_pg@mail.gmail.com Signed-off-by: Tetsuo Handa Signed-off-by: Ryusuke Konishi Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- fs/nilfs2/btree.c | 1 + fs/nilfs2/direct.c | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 2681a449edc1..13592e82eaf6 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2219,6 +2219,7 @@ static int nilfs_btree_assign_p(struct nilfs_bmap *btree, /* on-disk format */ binfo->bi_dat.bi_blkoff = cpu_to_le64(key); binfo->bi_dat.bi_level = level; + memset(binfo->bi_dat.bi_pad, 0, sizeof(binfo->bi_dat.bi_pad)); return 0; } diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index a35f2795b242..4c85914f2abc 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -314,6 +314,7 @@ static int nilfs_direct_assign_p(struct nilfs_bmap *direct, binfo->bi_dat.bi_blkoff = cpu_to_le64(key); binfo->bi_dat.bi_level = 0; + memset(binfo->bi_dat.bi_pad, 0, sizeof(binfo->bi_dat.bi_pad)); return 0; } -- cgit v1.2.3 From 42560f9c92cc43dce75dbf06cc0d840dced39b12 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 31 Mar 2023 05:55:15 +0900 Subject: nilfs2: fix sysfs interface lifetime The current nilfs2 sysfs support has issues with the timing of creation and deletion of sysfs entries, potentially leading to null pointer dereferences, use-after-free, and lockdep warnings. Some of the sysfs attributes for nilfs2 per-filesystem instance refer to metadata file "cpfile", "sufile", or "dat", but nilfs_sysfs_create_device_group that creates those attributes is executed before the inodes for these metadata files are loaded, and nilfs_sysfs_delete_device_group which deletes these sysfs entries is called after releasing their metadata file inodes. Therefore, access to some of these sysfs attributes may occur outside of the lifetime of these metadata files, resulting in inode NULL pointer dereferences or use-after-free. In addition, the call to nilfs_sysfs_create_device_group() is made during the locking period of the semaphore "ns_sem" of nilfs object, so the shrinker call caused by the memory allocation for the sysfs entries, may derive lock dependencies "ns_sem" -> (shrinker) -> "locks acquired in nilfs_evict_inode()". Since nilfs2 may acquire "ns_sem" deep in the call stack holding other locks via its error handler __nilfs_error(), this causes lockdep to report circular locking. This is a false positive and no circular locking actually occurs as no inodes exist yet when nilfs_sysfs_create_device_group() is called. Fortunately, the lockdep warnings can be resolved by simply moving the call to nilfs_sysfs_create_device_group() out of "ns_sem". This fixes these sysfs issues by revising where the device's sysfs interface is created/deleted and keeping its lifetime within the lifetime of the metadata files above. Link: https://lkml.kernel.org/r/20230330205515.6167-1-konishi.ryusuke@gmail.com Fixes: dd70edbde262 ("nilfs2: integrate sysfs support into driver") Signed-off-by: Ryusuke Konishi Reported-by: syzbot+979fa7f9c0d086fdc282@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/0000000000003414b505f7885f7e@google.com Reported-by: syzbot+5b7d542076d9bddc3c6a@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/0000000000006ac86605f5f44eb9@google.com Cc: Viacheslav Dubeyko Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/super.c | 2 ++ fs/nilfs2/the_nilfs.c | 12 +++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 1422b8ba24ed..77f1e5778d1c 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -482,6 +482,7 @@ static void nilfs_put_super(struct super_block *sb) up_write(&nilfs->ns_sem); } + nilfs_sysfs_delete_device_group(nilfs); iput(nilfs->ns_sufile); iput(nilfs->ns_cpfile); iput(nilfs->ns_dat); @@ -1105,6 +1106,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) nilfs_put_root(fsroot); failed_unload: + nilfs_sysfs_delete_device_group(nilfs); iput(nilfs->ns_sufile); iput(nilfs->ns_cpfile); iput(nilfs->ns_dat); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 3a4c9c150cbf..2894152a6b25 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -87,7 +87,6 @@ void destroy_nilfs(struct the_nilfs *nilfs) { might_sleep(); if (nilfs_init(nilfs)) { - nilfs_sysfs_delete_device_group(nilfs); brelse(nilfs->ns_sbh[0]); brelse(nilfs->ns_sbh[1]); } @@ -305,6 +304,10 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) goto failed; } + err = nilfs_sysfs_create_device_group(sb); + if (unlikely(err)) + goto sysfs_error; + if (valid_fs) goto skip_recovery; @@ -366,6 +369,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) goto failed; failed_unload: + nilfs_sysfs_delete_device_group(nilfs); + + sysfs_error: iput(nilfs->ns_cpfile); iput(nilfs->ns_sufile); iput(nilfs->ns_dat); @@ -697,10 +703,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) if (err) goto failed_sbh; - err = nilfs_sysfs_create_device_group(sb); - if (err) - goto failed_sbh; - set_nilfs_init(nilfs); err = 0; out: -- cgit v1.2.3