Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 1
-rw-r--r--  fs/affs/file.c | 9
-rw-r--r--  fs/anon_inodes.c | 23
-rw-r--r--  fs/binfmt_elf.c | 285
-rw-r--r--  fs/binfmt_elf_fdpic.c | 2
-rw-r--r--  fs/btrfs/block-group.c | 18
-rw-r--r--  fs/btrfs/discard.c | 51
-rw-r--r--  fs/btrfs/disk-io.c | 43
-rw-r--r--  fs/btrfs/extent-tree.c | 30
-rw-r--r--  fs/btrfs/extent_io.c | 7
-rw-r--r--  fs/btrfs/file.c | 9
-rw-r--r--  fs/btrfs/free-space-tree.c | 16
-rw-r--r--  fs/btrfs/inode.c | 122
-rw-r--r--  fs/btrfs/relocation.c | 6
-rw-r--r--  fs/btrfs/send.c | 6
-rw-r--r--  fs/btrfs/super.c | 3
-rw-r--r--  fs/btrfs/tree-log.c | 8
-rw-r--r--  fs/btrfs/volumes.c | 6
-rw-r--r--  fs/btrfs/zoned.c | 6
-rw-r--r--  fs/cachefiles/io.c | 2
-rw-r--r--  fs/cachefiles/ondemand.c | 4
-rw-r--r--  fs/ceph/file.c | 2
-rw-r--r--  fs/ceph/super.c | 1
-rw-r--r--  fs/configfs/dir.c | 2
-rw-r--r--  fs/coredump.c | 81
-rw-r--r--  fs/dlm/lowcomms.c | 4
-rw-r--r--  fs/erofs/data.c | 2
-rw-r--r--  fs/erofs/decompressor.c | 23
-rw-r--r--  fs/erofs/dir.c | 17
-rw-r--r--  fs/erofs/inode.c | 3
-rw-r--r--  fs/erofs/internal.h | 2
-rw-r--r--  fs/erofs/namei.c | 9
-rw-r--r--  fs/erofs/zdata.c | 182
-rw-r--r--  fs/erofs/zmap.c | 3
-rw-r--r--  fs/exec.c | 2
-rw-r--r--  fs/exfat/balloc.c | 10
-rw-r--r--  fs/exfat/exfat_fs.h | 2
-rw-r--r--  fs/exfat/fatent.c | 13
-rw-r--r--  fs/ext4/balloc.c | 4
-rw-r--r--  fs/ext4/block_validity.c | 5
-rw-r--r--  fs/ext4/dir.c | 3
-rw-r--r--  fs/ext4/ext4.h | 7
-rw-r--r--  fs/ext4/extents.c | 18
-rw-r--r--  fs/ext4/file.c | 7
-rw-r--r--  fs/ext4/inline.c | 2
-rw-r--r--  fs/ext4/inode.c | 78
-rw-r--r--  fs/ext4/namei.c | 2
-rw-r--r--  fs/ext4/super.c | 56
-rw-r--r--  fs/ext4/xattr.c | 11
-rw-r--r--  fs/f2fs/data.c | 6
-rw-r--r--  fs/f2fs/extent_cache.c | 2
-rw-r--r--  fs/f2fs/f2fs.h | 12
-rw-r--r--  fs/f2fs/inode.c | 25
-rw-r--r--  fs/f2fs/namei.c | 19
-rw-r--r--  fs/f2fs/node.c | 9
-rw-r--r--  fs/f2fs/segment.h | 5
-rw-r--r--  fs/f2fs/super.c | 42
-rw-r--r--  fs/file.c | 26
-rw-r--r--  fs/filesystems.c | 14
-rw-r--r--  fs/fuse/dax.c | 1
-rw-r--r--  fs/fuse/dir.c | 6
-rw-r--r--  fs/fuse/file.c | 4
-rw-r--r--  fs/fuse/virtio_fs.c | 3
-rw-r--r--  fs/gfs2/glock.c | 11
-rw-r--r--  fs/gfs2/inode.c | 3
-rw-r--r--  fs/gfs2/lock_dlm.c | 3
-rw-r--r--  fs/hfs/bnode.c | 6
-rw-r--r--  fs/hfsplus/bnode.c | 6
-rw-r--r--  fs/hfsplus/extents.c | 3
-rw-r--r--  fs/isofs/dir.c | 3
-rw-r--r--  fs/isofs/export.c | 2
-rw-r--r--  fs/isofs/inode.c | 9
-rw-r--r--  fs/jbd2/journal.c | 1
-rw-r--r--  fs/jbd2/transaction.c | 5
-rw-r--r--  fs/jffs2/erase.c | 4
-rw-r--r--  fs/jffs2/scan.c | 4
-rw-r--r--  fs/jffs2/summary.c | 7
-rw-r--r--  fs/jfs/jfs_dinode.h | 2
-rw-r--r--  fs/jfs/jfs_dmap.c | 47
-rw-r--r--  fs/jfs/jfs_dtree.c | 3
-rw-r--r--  fs/jfs/jfs_imap.c | 23
-rw-r--r--  fs/jfs/jfs_incore.h | 2
-rw-r--r--  fs/jfs/jfs_txnmgr.c | 4
-rw-r--r--  fs/jfs/jfs_xtree.c | 4
-rw-r--r--  fs/jfs/jfs_xtree.h | 37
-rw-r--r--  fs/jfs/xattr.c | 15
-rw-r--r--  fs/kernfs/dir.c | 5
-rw-r--r--  fs/kernfs/file.c | 3
-rw-r--r--  fs/namei.c | 24
-rw-r--r--  fs/namespace.c | 31
-rw-r--r--  fs/nfs/Kconfig | 2
-rw-r--r--  fs/nfs/client.c | 2
-rw-r--r--  fs/nfs/delegation.c | 36
-rw-r--r--  fs/nfs/dir.c | 19
-rw-r--r--  fs/nfs/export.c | 11
-rw-r--r--  fs/nfs/filelayout/filelayoutdev.c | 6
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 148
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayoutdev.c | 12
-rw-r--r--  fs/nfs/inode.c | 21
-rw-r--r--  fs/nfs/internal.h | 32
-rw-r--r--  fs/nfs/nfs3proc.c | 2
-rw-r--r--  fs/nfs/nfs4proc.c | 37
-rw-r--r--  fs/nfs/nfs4session.h | 4
-rw-r--r--  fs/nfs/nfs4state.c | 10
-rw-r--r--  fs/nfs/pnfs.c | 13
-rw-r--r--  fs/nfs/pnfs.h | 4
-rw-r--r--  fs/nfs/pnfs_nfs.c | 9
-rw-r--r--  fs/nfs/super.c | 19
-rw-r--r--  fs/nfsd/Kconfig | 1
-rw-r--r--  fs/nfsd/nfs4proc.c | 3
-rw-r--r--  fs/nfsd/nfs4state.c | 31
-rw-r--r--  fs/nfsd/nfsfh.h | 7
-rw-r--r--  fs/nfsd/nfssvc.c | 6
-rw-r--r--  fs/nilfs2/btree.c | 4
-rw-r--r--  fs/nilfs2/dir.c | 24
-rw-r--r--  fs/nilfs2/direct.c | 3
-rw-r--r--  fs/nilfs2/inode.c | 9
-rw-r--r--  fs/nilfs2/namei.c | 37
-rw-r--r--  fs/nilfs2/nilfs.h | 10
-rw-r--r--  fs/ntfs3/attrib.c | 176
-rw-r--r--  fs/ntfs3/file.c | 157
-rw-r--r--  fs/ntfs3/frecord.c | 2
-rw-r--r--  fs/ntfs3/index.c | 16
-rw-r--r--  fs/ntfs3/inode.c | 12
-rw-r--r--  fs/ntfs3/ntfs_fs.h | 9
-rw-r--r--  fs/ntfs3/record.c | 3
-rw-r--r--  fs/ntfs3/super.c | 68
-rw-r--r--  fs/ocfs2/alloc.c | 8
-rw-r--r--  fs/ocfs2/journal.c | 80
-rw-r--r--  fs/ocfs2/journal.h | 1
-rw-r--r--  fs/ocfs2/ocfs2.h | 17
-rw-r--r--  fs/ocfs2/quota_local.c | 11
-rw-r--r--  fs/ocfs2/super.c | 3
-rw-r--r--  fs/omfs/file.c | 12
-rw-r--r--  fs/omfs/omfs_fs.h | 2
-rw-r--r--  fs/orangefs/inode.c | 7
-rw-r--r--  fs/orangefs/orangefs-debugfs.c | 6
-rw-r--r--  fs/overlayfs/util.c | 4
-rw-r--r--  fs/proc/base.c | 11
-rw-r--r--  fs/proc/generic.c | 12
-rw-r--r--  fs/proc/inode.c | 10
-rw-r--r--  fs/proc/internal.h | 19
-rw-r--r--  fs/proc/proc_sysctl.c | 18
-rw-r--r--  fs/select.c | 11
-rw-r--r--  fs/smb/client/asn1.c | 2
-rw-r--r--  fs/smb/client/cached_dir.h | 8
-rw-r--r--  fs/smb/client/cifs_dfs_ref.c | 34
-rw-r--r--  fs/smb/client/cifs_spnego.c | 4
-rw-r--r--  fs/smb/client/cifsglob.h | 6
-rw-r--r--  fs/smb/client/cifspdu.h | 6
-rw-r--r--  fs/smb/client/cifsproto.h | 26
-rw-r--r--  fs/smb/client/cifssmb.c | 1
-rw-r--r--  fs/smb/client/connect.c | 63
-rw-r--r--  fs/smb/client/dir.c | 21
-rw-r--r--  fs/smb/client/file.c | 38
-rw-r--r--  fs/smb/client/fs_context.c | 19
-rw-r--r--  fs/smb/client/ioctl.c | 6
-rw-r--r--  fs/smb/client/link.c | 8
-rw-r--r--  fs/smb/client/misc.c | 8
-rw-r--r--  fs/smb/client/readdir.c | 35
-rw-r--r--  fs/smb/client/sess.c | 4
-rw-r--r--  fs/smb/client/smb1ops.c | 7
-rw-r--r--  fs/smb/client/smb2file.c | 11
-rw-r--r--  fs/smb/client/smb2misc.c | 9
-rw-r--r--  fs/smb/client/smb2ops.c | 10
-rw-r--r--  fs/smb/client/smb2pdu.c | 6
-rw-r--r--  fs/smb/client/smbdirect.c | 14
-rw-r--r--  fs/smb/client/transport.c | 2
-rw-r--r--  fs/smb/server/auth.c | 20
-rw-r--r--  fs/smb/server/connection.h | 1
-rw-r--r--  fs/smb/server/mgmt/user_session.c | 33
-rw-r--r--  fs/smb/server/mgmt/user_session.h | 2
-rw-r--r--  fs/smb/server/oplock.c | 13
-rw-r--r--  fs/smb/server/smb2pdu.c | 156
-rw-r--r--  fs/smb/server/smb_common.c | 2
-rw-r--r--  fs/smb/server/smbacl.c | 10
-rw-r--r--  fs/smb/server/transport_ipc.c | 8
-rw-r--r--  fs/smb/server/transport_rdma.c | 102
-rw-r--r--  fs/smb/server/transport_tcp.c | 17
-rw-r--r--  fs/smb/server/vfs.c | 14
-rw-r--r--  fs/squashfs/super.c | 5
-rw-r--r--  fs/userfaultfd.c | 6
-rw-r--r--  fs/vboxsf/super.c | 3
-rw-r--r--  fs/xfs/Kconfig | 12
-rw-r--r--  fs/xfs/libxfs/xfs_ag.c | 45
-rw-r--r--  fs/xfs/libxfs/xfs_ag.h | 3
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 87
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h | 24
-rw-r--r--  fs/xfs/libxfs/xfs_attr.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.c | 1
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 251
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 5
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_btree_staging.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_btree_staging.h | 6
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c | 27
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_data.c | 31
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_priv.h | 7
-rw-r--r--  fs/xfs/libxfs/xfs_format.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 24
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.c | 47
-rw-r--r--  fs/xfs/libxfs/xfs_log_recover.h | 22
-rw-r--r--  fs/xfs/libxfs/xfs_quota_defs.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.c | 129
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_refcount_btree.c | 9
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.c | 10
-rw-r--r--  fs/xfs/libxfs/xfs_rtbitmap.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_rtbitmap.h | 83
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 27
-rw-r--r--  fs/xfs/libxfs/xfs_sb.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_trans_resv.c | 28
-rw-r--r--  fs/xfs/libxfs/xfs_types.h | 13
-rw-r--r--  fs/xfs/scrub/attr.c | 5
-rw-r--r--  fs/xfs/scrub/bmap.c | 8
-rw-r--r--  fs/xfs/scrub/repair.c | 3
-rw-r--r--  fs/xfs/scrub/rtbitmap.c | 3
-rw-r--r--  fs/xfs/xfs.h | 4
-rw-r--r--  fs/xfs/xfs_aops.c | 54
-rw-r--r--  fs/xfs/xfs_attr_item.c | 104
-rw-r--r--  fs/xfs/xfs_bmap_item.c | 85
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 83
-rw-r--r--  fs/xfs/xfs_bmap_util.h | 2
-rw-r--r--  fs/xfs/xfs_buf.c | 44
-rw-r--r--  fs/xfs/xfs_buf.h | 1
-rw-r--r--  fs/xfs/xfs_buf_item.c | 32
-rw-r--r--  fs/xfs/xfs_dquot.c | 1
-rw-r--r--  fs/xfs/xfs_dquot_item.c | 31
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 144
-rw-r--r--  fs/xfs/xfs_file.c | 33
-rw-r--r--  fs/xfs/xfs_file.h | 15
-rw-r--r--  fs/xfs/xfs_fsmap.c | 268
-rw-r--r--  fs/xfs/xfs_fsops.c | 5
-rw-r--r--  fs/xfs/xfs_icache.c | 2
-rw-r--r--  fs/xfs/xfs_inode.c | 43
-rw-r--r--  fs/xfs/xfs_inode.h | 2
-rw-r--r--  fs/xfs/xfs_inode_item.c | 35
-rw-r--r--  fs/xfs/xfs_ioctl.c | 12
-rw-r--r--  fs/xfs/xfs_iomap.c | 81
-rw-r--r--  fs/xfs/xfs_iops.c | 1
-rw-r--r--  fs/xfs/xfs_iops.h | 3
-rw-r--r--  fs/xfs/xfs_refcount_item.c | 68
-rw-r--r--  fs/xfs/xfs_reflink.c | 27
-rw-r--r--  fs/xfs/xfs_rmap_item.c | 6
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 90
-rw-r--r--  fs/xfs/xfs_rtalloc.h | 73
-rw-r--r--  fs/xfs/xfs_symlink.c | 8
-rw-r--r--  fs/xfs/xfs_trace.h | 40
249 files changed, 3752 insertions(+), 2177 deletions(-)
diff --git a/fs/Kconfig b/fs/Kconfig
index 703a1cea0fc0..7104e3eb38eb 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -347,6 +347,7 @@ config GRACE_PERIOD
config LOCKD
tristate
depends on FILE_LOCKING
+ select CRC32
select GRACE_PERIOD
config LOCKD_V4
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 8daeed31e1af..52d11931025c 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -595,7 +595,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
BUG_ON(tmp > bsize);
AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino);
- AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx);
+ AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx + 1);
AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp);
affs_fix_checksum(sb, bh);
bh->b_state &= ~(1UL << BH_New);
@@ -724,7 +724,8 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
tmp = min(bsize - boff, to - from);
BUG_ON(boff + tmp > bsize || tmp > bsize);
memcpy(AFFS_DATA(bh) + boff, data + from, tmp);
- be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp);
+ AFFS_DATA_HEAD(bh)->size = cpu_to_be32(
+ max(boff + tmp, be32_to_cpu(AFFS_DATA_HEAD(bh)->size)));
affs_fix_checksum(sb, bh);
mark_buffer_dirty_inode(bh, inode);
written += tmp;
@@ -746,7 +747,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
if (buffer_new(bh)) {
AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino);
- AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx);
+ AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx + 1);
AFFS_DATA_HEAD(bh)->size = cpu_to_be32(bsize);
AFFS_DATA_HEAD(bh)->next = 0;
bh->b_state &= ~(1UL << BH_New);
@@ -780,7 +781,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
if (buffer_new(bh)) {
AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino);
- AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx);
+ AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx + 1);
AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp);
AFFS_DATA_HEAD(bh)->next = 0;
bh->b_state &= ~(1UL << BH_New);
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 24192a7667ed..7a9b1ce9c98a 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -55,15 +55,26 @@ static struct file_system_type anon_inode_fs_type = {
.kill_sb = kill_anon_super,
};
-static struct inode *anon_inode_make_secure_inode(
- const char *name,
- const struct inode *context_inode)
+/**
+ * anon_inode_make_secure_inode - allocate an anonymous inode with security context
+ * @sb: [in] Superblock to allocate from
+ * @name: [in] Name of the class of the new file (e.g., "secretmem")
+ * @context_inode:
+ * [in] Optional parent inode for security inheritance
+ *
+ * The function ensures proper security initialization through the LSM hook
+ * security_inode_init_security_anon().
+ *
+ * Return: Pointer to new inode on success, ERR_PTR on failure.
+ */
+struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *name,
+ const struct inode *context_inode)
{
struct inode *inode;
const struct qstr qname = QSTR_INIT(name, strlen(name));
int error;
- inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
+ inode = alloc_anon_inode(sb);
if (IS_ERR(inode))
return inode;
inode->i_flags &= ~S_PRIVATE;
@@ -74,6 +85,7 @@ static struct inode *anon_inode_make_secure_inode(
}
return inode;
}
+EXPORT_SYMBOL_GPL(anon_inode_make_secure_inode);
static struct file *__anon_inode_getfile(const char *name,
const struct file_operations *fops,
@@ -88,7 +100,8 @@ static struct file *__anon_inode_getfile(const char *name,
return ERR_PTR(-ENOENT);
if (secure) {
- inode = anon_inode_make_secure_inode(name, context_inode);
+ inode = anon_inode_make_secure_inode(anon_inode_mnt->mnt_sb,
+ name, context_inode);
if (IS_ERR(inode)) {
file = ERR_CAST(inode);
goto err;
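With the superblock parameter and the GPL export above, code outside anon_inodes.c can now mint secure anonymous inodes on its own superblock. A minimal sketch of such a caller, assuming a filesystem that keeps a private superblock; the function name and the "example" class are illustrative, not part of this patch:

static struct inode *example_new_secure_inode(struct super_block *sb)
{
	struct inode *inode;

	/* No context inode, so the LSM initializes a fresh anonymous context. */
	inode = anon_inode_make_secure_inode(sb, "example", NULL);
	if (IS_ERR(inode))
		return inode;

	/* S_PRIVATE was cleared above, so LSMs will see and police this inode. */
	return inode;
}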
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 89e7e4826efc..762704eed9ce 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -109,25 +109,6 @@ static struct linux_binfmt elf_format = {
#define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE))
-static int set_brk(unsigned long start, unsigned long end, int prot)
-{
- start = ELF_PAGEALIGN(start);
- end = ELF_PAGEALIGN(end);
- if (end > start) {
- /*
- * Map the last of the bss segment.
- * If the header is requesting these pages to be
- * executable, honour that (ppc32 needs this).
- */
- int error = vm_brk_flags(start, end - start,
- prot & PROT_EXEC ? VM_EXEC : 0);
- if (error)
- return error;
- }
- current->mm->start_brk = current->mm->brk = end;
- return 0;
-}
-
/* We need to explicitly zero any fractional pages
after the data section (i.e. bss). This would
contain the junk from the file that should not
@@ -248,7 +229,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
} while (0)
#ifdef ARCH_DLINFO
- /*
+ /*
* ARCH_DLINFO must come first so PPC can do its special alignment of
* AUXV.
* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in
@@ -401,6 +382,51 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
return(map_addr);
}
+static unsigned long elf_load(struct file *filep, unsigned long addr,
+ const struct elf_phdr *eppnt, int prot, int type,
+ unsigned long total_size)
+{
+ unsigned long zero_start, zero_end;
+ unsigned long map_addr;
+
+ if (eppnt->p_filesz) {
+ map_addr = elf_map(filep, addr, eppnt, prot, type, total_size);
+ if (BAD_ADDR(map_addr))
+ return map_addr;
+ if (eppnt->p_memsz > eppnt->p_filesz) {
+ zero_start = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) +
+ eppnt->p_filesz;
+ zero_end = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) +
+ eppnt->p_memsz;
+
+ /* Zero the end of the last mapped page */
+ padzero(zero_start);
+ }
+ } else {
+ map_addr = zero_start = ELF_PAGESTART(addr);
+ zero_end = zero_start + ELF_PAGEOFFSET(eppnt->p_vaddr) +
+ eppnt->p_memsz;
+ }
+ if (eppnt->p_memsz > eppnt->p_filesz) {
+ /*
+ * Map the last of the segment.
+ * If the header is requesting these pages to be
+ * executable, honour that (ppc32 needs this).
+ */
+ int error;
+
+ zero_start = ELF_PAGEALIGN(zero_start);
+ zero_end = ELF_PAGEALIGN(zero_end);
+
+ error = vm_brk_flags(zero_start, zero_end - zero_start,
+ prot & PROT_EXEC ? VM_EXEC : 0);
+ if (error)
+ map_addr = error;
+ }
+ return map_addr;
+}
+
+
static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr)
{
elf_addr_t min_addr = -1;
@@ -829,8 +855,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
struct elf_phdr *elf_property_phdata = NULL;
- unsigned long elf_bss, elf_brk;
- int bss_prot = 0;
+ unsigned long elf_brk;
+ bool brk_moved = false;
int retval, i;
unsigned long elf_entry;
unsigned long e_entry;
@@ -1021,8 +1047,7 @@ out_free_interp:
executable_stack);
if (retval < 0)
goto out_free_dentry;
-
- elf_bss = 0;
+
elf_brk = 0;
start_code = ~0UL;
@@ -1042,33 +1067,6 @@ out_free_interp:
if (elf_ppnt->p_type != PT_LOAD)
continue;
- if (unlikely (elf_brk > elf_bss)) {
- unsigned long nbyte;
-
- /* There was a PT_LOAD segment with p_memsz > p_filesz
- before this one. Map anonymous pages, if needed,
- and clear the area. */
- retval = set_brk(elf_bss + load_bias,
- elf_brk + load_bias,
- bss_prot);
- if (retval)
- goto out_free_dentry;
- nbyte = ELF_PAGEOFFSET(elf_bss);
- if (nbyte) {
- nbyte = ELF_MIN_ALIGN - nbyte;
- if (nbyte > elf_brk - elf_bss)
- nbyte = elf_brk - elf_bss;
- if (clear_user((void __user *)elf_bss +
- load_bias, nbyte)) {
- /*
- * This bss-zeroing can fail if the ELF
- * file specifies odd protections. So
- * we don't check the return value
- */
- }
- }
- }
-
elf_prot = make_prot(elf_ppnt->p_flags, &arch_state,
!!interpreter, false);
@@ -1096,15 +1094,49 @@ out_free_interp:
* Header for ET_DYN binaries to calculate the
* randomization (load_bias) for all the LOAD
* Program Headers.
+ */
+
+ /*
+ * Calculate the entire size of the ELF mapping
+ * (total_size), used for the initial mapping,
+ * due to load_addr_set which is set to true later
+ * once the initial mapping is performed.
+ *
+ * Note that this is only sensible when the LOAD
+ * segments are contiguous (or overlapping). If
+ * used for LOADs that are far apart, this would
+ * cause the holes between LOADs to be mapped,
+ * running the risk of having the mapping fail,
+ * as it would be larger than the ELF file itself.
+ *
+ * As a result, only ET_DYN does this, since
+ * some ET_EXEC (e.g. ia64) may have large virtual
+ * memory holes between LOADs.
+ *
+ */
+ total_size = total_mapping_size(elf_phdata,
+ elf_ex->e_phnum);
+ if (!total_size) {
+ retval = -EINVAL;
+ goto out_free_dentry;
+ }
+
+ /* Calculate any requested alignment. */
+ alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
+
+ /**
+ * DOC: PIE handling
*
- * There are effectively two types of ET_DYN
- * binaries: programs (i.e. PIE: ET_DYN with INTERP)
- * and loaders (ET_DYN without INTERP, since they
- * _are_ the ELF interpreter). The loaders must
- * be loaded away from programs since the program
- * may otherwise collide with the loader (especially
- * for ET_EXEC which does not have a randomized
- * position). For example to handle invocations of
+ * There are effectively two types of ET_DYN ELF
+ * binaries: programs (i.e. PIE: ET_DYN with
+ * PT_INTERP) and loaders (i.e. static PIE: ET_DYN
+ * without PT_INTERP, usually the ELF interpreter
+ * itself). Loaders must be loaded away from programs
+ * since the program may otherwise collide with the
+ * loader (especially for ET_EXEC which does not have
+ * a randomized position).
+ *
+ * For example, to handle invocations of
* "./ld.so someprog" to test out a new version of
* the loader, the subsequent program that the
* loader loads must avoid the loader itself, so
@@ -1117,17 +1149,49 @@ out_free_interp:
* ELF_ET_DYN_BASE and loaders are loaded into the
* independently randomized mmap region (0 load_bias
* without MAP_FIXED nor MAP_FIXED_NOREPLACE).
+ *
+ * See below for "brk" handling details, which is
+ * also affected by program vs loader and ASLR.
*/
if (interpreter) {
+ /* On ET_DYN with PT_INTERP, we do the ASLR. */
load_bias = ELF_ET_DYN_BASE;
if (current->flags & PF_RANDOMIZE)
load_bias += arch_mmap_rnd();
- alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
+ /* Adjust alignment as requested. */
if (alignment)
load_bias &= ~(alignment - 1);
elf_flags |= MAP_FIXED_NOREPLACE;
- } else
- load_bias = 0;
+ } else {
+ /*
+ * For ET_DYN without PT_INTERP, we rely on
+ * the architecture's (potentially ASLR) mmap
+ * base address (via a load_bias of 0).
+ *
+ * When a large alignment is requested, we
+ * must do the allocation at address "0" right
+ * now to discover where things will load so
+ * that we can adjust the resulting alignment.
+ * In this case (load_bias != 0), we can use
+ * MAP_FIXED_NOREPLACE to make sure the mapping
+ * doesn't collide with anything.
+ */
+ if (alignment > ELF_MIN_ALIGN) {
+ load_bias = elf_load(bprm->file, 0, elf_ppnt,
+ elf_prot, elf_flags, total_size);
+ if (BAD_ADDR(load_bias)) {
+ retval = IS_ERR_VALUE(load_bias) ?
+ PTR_ERR((void*)load_bias) : -EINVAL;
+ goto out_free_dentry;
+ }
+ vm_munmap(load_bias, total_size);
+ /* Adjust alignment as requested. */
+ if (alignment)
+ load_bias &= ~(alignment - 1);
+ elf_flags |= MAP_FIXED_NOREPLACE;
+ } else
+ load_bias = 0;
+ }
/*
* Since load_bias is used for all subsequent loading
@@ -1137,34 +1201,9 @@ out_free_interp:
* is then page aligned.
*/
load_bias = ELF_PAGESTART(load_bias - vaddr);
-
- /*
- * Calculate the entire size of the ELF mapping
- * (total_size), used for the initial mapping,
- * due to load_addr_set which is set to true later
- * once the initial mapping is performed.
- *
- * Note that this is only sensible when the LOAD
- * segments are contiguous (or overlapping). If
- * used for LOADs that are far apart, this would
- * cause the holes between LOADs to be mapped,
- * running the risk of having the mapping fail,
- * as it would be larger than the ELF file itself.
- *
- * As a result, only ET_DYN does this, since
- * some ET_EXEC (e.g. ia64) may have large virtual
- * memory holes between LOADs.
- *
- */
- total_size = total_mapping_size(elf_phdata,
- elf_ex->e_phnum);
- if (!total_size) {
- retval = -EINVAL;
- goto out_free_dentry;
- }
}
- error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
+ error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt,
elf_prot, elf_flags, total_size);
if (BAD_ADDR(error)) {
retval = IS_ERR((void *)error) ?
@@ -1212,41 +1251,23 @@ out_free_interp:
k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz;
- if (k > elf_bss)
- elf_bss = k;
if ((elf_ppnt->p_flags & PF_X) && end_code < k)
end_code = k;
if (end_data < k)
end_data = k;
k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
- if (k > elf_brk) {
- bss_prot = elf_prot;
+ if (k > elf_brk)
elf_brk = k;
- }
}
e_entry = elf_ex->e_entry + load_bias;
phdr_addr += load_bias;
- elf_bss += load_bias;
elf_brk += load_bias;
start_code += load_bias;
end_code += load_bias;
start_data += load_bias;
end_data += load_bias;
- /* Calling set_brk effectively mmaps the pages that we need
- * for the bss and break sections. We must do this before
- * mapping in the interpreter, to make sure it doesn't wind
- * up getting placed where the bss needs to go.
- */
- retval = set_brk(elf_bss, elf_brk, bss_prot);
- if (retval)
- goto out_free_dentry;
- if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
- retval = -EFAULT; /* Nobody gets to see this, but.. */
- goto out_free_dentry;
- }
-
if (interpreter) {
elf_entry = load_elf_interp(interp_elf_ex,
interpreter,
@@ -1302,24 +1323,44 @@ out_free_interp:
mm->end_data = end_data;
mm->start_stack = bprm->p;
- if ((current->flags & PF_RANDOMIZE) && (snapshot_randomize_va_space > 1)) {
+ /**
+ * DOC: "brk" handling
+ *
+ * For architectures with ELF randomization, when executing a
+ * loader directly (i.e. static PIE: ET_DYN without PT_INTERP),
+ * move the brk area out of the mmap region and into the unused
+ * ELF_ET_DYN_BASE region. Since "brk" grows up it may collide
+ * early with the stack growing down or other regions being put
+ * into the mmap region by the kernel (e.g. vdso).
+ *
+ * In the CONFIG_COMPAT_BRK case, though, everything is turned
+ * off because we're not allowed to move the brk at all.
+ */
+ if (!IS_ENABLED(CONFIG_COMPAT_BRK) &&
+ IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
+ elf_ex->e_type == ET_DYN && !interpreter) {
+ elf_brk = ELF_ET_DYN_BASE;
+ /* This counts as moving the brk, so let brk(2) know. */
+ brk_moved = true;
+ }
+ mm->start_brk = mm->brk = ELF_PAGEALIGN(elf_brk);
+
+ if ((current->flags & PF_RANDOMIZE) && snapshot_randomize_va_space > 1) {
/*
- * For architectures with ELF randomization, when executing
- * a loader directly (i.e. no interpreter listed in ELF
- * headers), move the brk area out of the mmap region
- * (since it grows up, and may collide early with the stack
- * growing down), and into the unused ELF_ET_DYN_BASE region.
+ * If we didn't move the brk to ELF_ET_DYN_BASE (above),
+ * leave a gap between .bss and brk.
*/
- if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
- elf_ex->e_type == ET_DYN && !interpreter) {
- mm->brk = mm->start_brk = ELF_ET_DYN_BASE;
- }
+ if (!brk_moved)
+ mm->brk = mm->start_brk = mm->brk + PAGE_SIZE;
mm->brk = mm->start_brk = arch_randomize_brk(mm);
+ brk_moved = true;
+ }
+
#ifdef compat_brk_randomized
+ if (brk_moved)
current->brk_randomized = 1;
#endif
- }
if (current->personality & MMAP_PAGE_ZERO) {
/* Why this, you ask??? Well SVr4 maps page 0 as read-only,
@@ -1522,7 +1563,7 @@ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
phdr->p_align = 0;
}
-static void fill_note(struct memelfnote *note, const char *name, int type,
+static void fill_note(struct memelfnote *note, const char *name, int type,
unsigned int sz, void *data)
{
note->name = name;
@@ -2005,8 +2046,8 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
t->num_notes = 0;
fill_prstatus(&t->prstatus.common, p, signr);
- elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
-
+ elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
+
fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
&(t->prstatus));
t->num_notes++;
@@ -2296,7 +2337,7 @@ static int elf_core_dump(struct coredump_params *cprm)
if (!elf_core_write_extra_phdrs(cprm, offset))
goto end_coredump;
- /* write out the notes section */
+ /* write out the notes section */
if (!write_note_info(&info, cprm))
goto end_coredump;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index c71a40927315..b2d3b6e43bb5 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1603,7 +1603,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (!elf_core_write_extra_phdrs(cprm, offset))
goto end_coredump;
- /* write out the notes section */
+ /* write out the notes section */
if (!writenote(thread_list->notes, cprm))
goto end_coredump;
if (!writenote(&psinfo_note, cprm))
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 0dcf7fecaf55..91440ef79a26 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1678,6 +1678,17 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
up_write(&space_info->groups_sem);
goto next;
}
+
+ /*
+ * Cache the zone_unusable value before turning the block group
+ * to read only. As soon as the block group is read only its
+ * zone_unusable value gets moved to the block group's read-only
+ * bytes and isn't available for calculations anymore. We also
+ * cache it before unlocking the block group, to prevent races
+ * (reports from KCSAN and such tools) with tasks updating it.
+ */
+ zone_unusable = bg->zone_unusable;
+
spin_unlock(&bg->lock);
/*
@@ -1693,13 +1704,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
goto next;
}
- /*
- * Cache the zone_unusable value before turning the block group
- * to read only. As soon as the blog group is read only it's
- * zone_unusable value gets moved to the block group's read-only
- * bytes and isn't available for calculations anymore.
- */
- zone_unusable = bg->zone_unusable;
ret = inc_block_group_ro(bg, 0);
up_write(&space_info->groups_sem);
if (ret < 0)
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index bd9dde374e5d..a90f3cb83c70 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -78,8 +78,6 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
lockdep_assert_held(&discard_ctl->lock);
- if (!btrfs_run_discard_work(discard_ctl))
- return;
if (list_empty(&block_group->discard_list) ||
block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
@@ -102,6 +100,9 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
if (!btrfs_is_block_group_data_only(block_group))
return;
+ if (!btrfs_run_discard_work(discard_ctl))
+ return;
+
spin_lock(&discard_ctl->lock);
__add_to_discard_list(discard_ctl, block_group);
spin_unlock(&discard_ctl->lock);
@@ -151,13 +152,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
block_group->discard_eligible_time = 0;
queued = !list_empty(&block_group->discard_list);
list_del_init(&block_group->discard_list);
- /*
- * If the block group is currently running in the discard workfn, we
- * don't want to deref it, since it's still being used by the workfn.
- * The workfn will notice this case and deref the block group when it is
- * finished.
- */
- if (queued && !running)
+ if (queued)
btrfs_put_block_group(block_group);
spin_unlock(&discard_ctl->lock);
@@ -233,6 +228,18 @@ again:
block_group->used != 0) {
if (btrfs_is_block_group_data_only(block_group)) {
__add_to_discard_list(discard_ctl, block_group);
+ /*
+ * The block group must have been moved to another
+ * discard list even if discard was disabled in
+ * the meantime or a transaction abort happened,
+ * otherwise we can end up in an infinite loop,
+ * always jumping into the 'again' label and
+ * getting this block group over and over
+ * in case there are no other block groups in
+ * the discard lists.
+ */
+ ASSERT(block_group->discard_index !=
+ BTRFS_DISCARD_INDEX_UNUSED);
} else {
list_del_init(&block_group->discard_list);
btrfs_put_block_group(block_group);
@@ -243,9 +250,10 @@ again:
block_group->discard_cursor = block_group->start;
block_group->discard_state = BTRFS_DISCARD_EXTENTS;
}
- discard_ctl->block_group = block_group;
}
if (block_group) {
+ btrfs_get_block_group(block_group);
+ discard_ctl->block_group = block_group;
*discard_state = block_group->discard_state;
*discard_index = block_group->discard_index;
}
@@ -469,9 +477,20 @@ static void btrfs_discard_workfn(struct work_struct *work)
block_group = peek_discard_list(discard_ctl, &discard_state,
&discard_index, now);
- if (!block_group || !btrfs_run_discard_work(discard_ctl))
+ if (!block_group)
+ return;
+ if (!btrfs_run_discard_work(discard_ctl)) {
+ spin_lock(&discard_ctl->lock);
+ btrfs_put_block_group(block_group);
+ discard_ctl->block_group = NULL;
+ spin_unlock(&discard_ctl->lock);
return;
+ }
if (now < block_group->discard_eligible_time) {
+ spin_lock(&discard_ctl->lock);
+ btrfs_put_block_group(block_group);
+ discard_ctl->block_group = NULL;
+ spin_unlock(&discard_ctl->lock);
btrfs_discard_schedule_work(discard_ctl, false);
return;
}
@@ -523,15 +542,7 @@ static void btrfs_discard_workfn(struct work_struct *work)
spin_lock(&discard_ctl->lock);
discard_ctl->prev_discard = trimmed;
discard_ctl->prev_discard_time = now;
- /*
- * If the block group was removed from the discard list while it was
- * running in this workfn, then we didn't deref it, since this function
- * still owned that reference. But we set the discard_ctl->block_group
- * back to NULL, so we can use that condition to know that now we need
- * to deref the block_group.
- */
- if (discard_ctl->block_group == NULL)
- btrfs_put_block_group(block_group);
+ btrfs_put_block_group(block_group);
discard_ctl->block_group = NULL;
__btrfs_discard_schedule_work(discard_ctl, now, false);
spin_unlock(&discard_ctl->lock);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 30fe5ebc3650..76a261cbf39d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2481,8 +2481,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
found = true;
root = read_tree_root_path(tree_root, path, &key);
if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
- ret = PTR_ERR(root);
+ ret = PTR_ERR(root);
break;
}
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
@@ -4642,6 +4641,14 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_cleanup_defrag_inodes(fs_info);
/*
+ * Handle the error fs first, as it will flush and wait for all ordered
+ * extents. This will generate delayed iputs, thus we want to handle
+ * it first.
+ */
+ if (unlikely(BTRFS_FS_ERROR(fs_info)))
+ btrfs_error_commit_super(fs_info);
+
+ /*
* Wait for any fixup workers to complete.
* If we don't wait for them here and they are still running by the time
* we call kthread_stop() against the cleaner kthread further below, we
@@ -4662,6 +4669,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_flush_workqueue(fs_info->delalloc_workers);
/*
+ * We can have ordered extents getting their last reference dropped from
+ * the fs_info->workers queue because for async writes for data bios we
+ * queue a work for that queue, at btrfs_wq_submit_bio(), that runs
+ * run_one_async_done() which calls btrfs_bio_end_io() in case the bio
+ * has an error, and that later function can do the final
+ * btrfs_put_ordered_extent() on the ordered extent attached to the bio,
+ * which adds a delayed iput for the inode. So we must flush the queue
+ * so that we don't have delayed iputs after committing the current
+ * transaction below and stopping the cleaner and transaction kthreads.
+ */
+ btrfs_flush_workqueue(fs_info->workers);
+
+ /*
+ * When finishing a compressed write bio we schedule a work queue item
+ * to finish an ordered extent - btrfs_finish_compressed_write_work()
+ * calls btrfs_finish_ordered_extent() which in turn calls
+ * btrfs_queue_ordered_fn(), and that queues the ordered extent
+ * completion either in the endio_write_workers work queue or in the
+ * fs_info->endio_freespace_worker work queue. We flush those queues
+ * below, so before we flush them we must flush this queue for the
+ * workers of compressed writes.
+ */
+ flush_workqueue(fs_info->compressed_write_workers);
+
+ /*
* After we parked the cleaner kthread, ordered extents may have
* completed and created new delayed iputs. If one of the async reclaim
* tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
@@ -4718,9 +4750,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "commit super ret %d", ret);
}
- if (BTRFS_FS_ERROR(fs_info))
- btrfs_error_commit_super(fs_info);
-
kthread_stop(fs_info->transaction_kthread);
kthread_stop(fs_info->cleaner_kthread);
@@ -4876,10 +4905,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
/* cleanup FS via transaction */
btrfs_cleanup_transaction(fs_info);
- mutex_lock(&fs_info->cleaner_mutex);
- btrfs_run_delayed_iputs(fs_info);
- mutex_unlock(&fs_info->cleaner_mutex);
-
down_write(&fs_info->cleanup_work_sem);
up_write(&fs_info->cleanup_work_sem);
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 50bc553cc73a..5395e27f9e89 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -179,6 +179,14 @@ search_again:
ei = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_item);
num_refs = btrfs_extent_refs(leaf, ei);
+ if (unlikely(num_refs == 0)) {
+ ret = -EUCLEAN;
+ btrfs_err(fs_info,
+ "unexpected zero reference count for extent item (%llu %u %llu)",
+ key.objectid, key.type, key.offset);
+ btrfs_abort_transaction(trans, ret);
+ goto out_free;
+ }
extent_flags = btrfs_extent_flags(leaf, ei);
} else {
ret = -EINVAL;
@@ -190,8 +198,6 @@ search_again:
goto out_free;
}
-
- BUG_ON(num_refs == 0);
} else {
num_refs = 0;
extent_flags = 0;
@@ -221,10 +227,19 @@ search_again:
goto search_again;
}
spin_lock(&head->lock);
- if (head->extent_op && head->extent_op->update_flags)
+ if (head->extent_op && head->extent_op->update_flags) {
extent_flags |= head->extent_op->flags_to_set;
- else
- BUG_ON(num_refs == 0);
+ } else if (unlikely(num_refs == 0)) {
+ spin_unlock(&head->lock);
+ mutex_unlock(&head->mutex);
+ spin_unlock(&delayed_refs->lock);
+ ret = -EUCLEAN;
+ btrfs_err(fs_info,
+ "unexpected zero reference count for extent %llu (%s)",
+ bytenr, metadata ? "metadata" : "data");
+ btrfs_abort_transaction(trans, ret);
+ goto out_free;
+ }
num_refs += head->ref_mod;
spin_unlock(&head->lock);
@@ -5575,7 +5590,10 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
ret = btrfs_dec_ref(trans, root, eb, 1);
else
ret = btrfs_dec_ref(trans, root, eb, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
if (is_fstree(root->root_key.objectid)) {
ret = btrfs_qgroup_trace_leaf_items(trans, eb);
if (ret) {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 72227c0b4b5a..d5552875f872 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4459,10 +4459,10 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
}
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start)
{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *eb, *exists = NULL;
int ret;
@@ -4498,8 +4498,11 @@ again:
free_eb:
btrfs_release_extent_buffer(eb);
return exists;
-}
+#else
+ /* Stub to avoid linker error when compiled with optimizations turned off. */
+ return NULL;
#endif
+}
static struct extent_buffer *grab_extent_buffer(
struct btrfs_fs_info *fs_info, struct page *page)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9e06d1a0d373..3814f09dc4ae 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2224,15 +2224,20 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
* will always return true.
* So here we need to do extra page alignment for
* filemap_range_has_page().
+ *
+ * Also, do not decrement page_lockend yet, as it can be 0.
*/
const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
- const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
+ const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE);
while (1) {
truncate_pagecache_range(inode, lockstart, lockend);
lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
cached_state);
+ /* The same page or adjacent pages. */
+ if (page_lockend <= page_lockstart)
+ break;
/*
* We can't have ordered extents in the range, nor dirty/writeback
* pages, because we have locked the inode's VFS lock in exclusive
@@ -2244,7 +2249,7 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
* we do, unlock the range and retry.
*/
if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
- page_lockend))
+ page_lockend - 1))
break;
unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
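Why the new code must not subtract one up front: when the punched range sits inside a single page, round_down(lockend + 1, PAGE_SIZE) is 0, and the old "- 1" wrapped it to U64_MAX, turning the filemap_range_has_page() check into a scan of almost the whole mapping. A userspace sketch of the arithmetic; round_up()/round_down() stand in for the kernel macros:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL
#define round_up(x, y)   ((((x) + (y) - 1) / (y)) * (y))
#define round_down(x, y) (((x) / (y)) * (y))

int main(void)
{
	/* A punch-hole request contained in a single page. */
	uint64_t lockstart = 100, lockend = 200;
	uint64_t page_lockstart = round_up(lockstart, PAGE_SIZE);   /* 4096 */
	uint64_t page_lockend = round_down(lockend + 1, PAGE_SIZE); /* 0 */

	/* The old code computed page_lockend - 1, wrapping 0 to U64_MAX. */
	printf("page_lockstart=%llu page_lockend=%llu old_end=%llu\n",
	       (unsigned long long)page_lockstart,
	       (unsigned long long)page_lockend,
	       (unsigned long long)(page_lockend - 1));
	return 0;
}

With page_lockend left undecremented, the new "page_lockend <= page_lockstart" test catches this same-page case and breaks out before the page-cache check.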
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 6a44733a95e1..14bdb241ff6b 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1098,11 +1098,21 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
if (ret < 0)
goto out_locked;
- ASSERT(ret == 0);
+ /*
+ * If ret is 1 (no key found), it means this is an empty block group,
+ * without any extents allocated from it and there's no block group
+ * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree
+ * because we are using the block group tree feature, so block group
+ * items are stored in the block group tree. It also means there are no
+ * extents allocated for block groups with a start offset beyond this
+ * block group's end offset (this is the last, highest, block group).
+ */
+ if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE))
+ ASSERT(ret == 0);
start = block_group->start;
end = block_group->start + block_group->length;
- while (1) {
+ while (ret == 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
@@ -1132,8 +1142,6 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = btrfs_next_item(extent_root, path);
if (ret < 0)
goto out_locked;
- if (ret)
- break;
}
if (start < end) {
ret = __add_to_free_space_tree(trans, block_group, path2,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a13ab3abef12..469a622b440b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1428,6 +1428,7 @@ out_unlock:
locked_page,
clear_bits,
page_ops);
+ btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
start += cur_alloc_size;
}
@@ -1441,6 +1442,7 @@ out_unlock:
clear_bits |= EXTENT_CLEAR_DATA_RESV;
extent_clear_unlock_delalloc(inode, start, end, locked_page,
clear_bits, page_ops);
+ btrfs_qgroup_free_data(inode, NULL, start, end - start + 1, NULL);
}
return ret;
}
@@ -2168,13 +2170,15 @@ error:
if (nocow)
btrfs_dec_nocow_writers(bg);
- if (ret && cur_offset < end)
+ if (ret && cur_offset < end) {
extent_clear_unlock_delalloc(inode, cur_offset, end,
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
+ btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL);
+ }
btrfs_free_path(path);
return ret;
}
@@ -4852,7 +4856,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
int err = 0;
struct btrfs_trans_handle *trans;
- u64 last_unlink_trans;
struct fscrypt_name fname;
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
@@ -4878,6 +4881,23 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
goto out_notrans;
}
+ /*
+ * Propagate the last_unlink_trans value of the deleted dir to its
+ * parent directory. This is to prevent an unrecoverable log tree in the
+ * case we do something like this:
+ * 1) create dir foo
+ * 2) create snapshot under dir foo
+ * 3) delete the snapshot
+ * 4) rmdir foo
+ * 5) mkdir foo
+ * 6) fsync foo or some file inside foo
+ *
+ * This is because we can't unlink other roots when replaying the dir
+ * deletes for directory foo.
+ */
+ if (BTRFS_I(inode)->last_unlink_trans >= trans->transid)
+ BTRFS_I(dir)->last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
+
if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
err = btrfs_unlink_subvol(trans, dir, dentry);
goto out;
@@ -4887,26 +4907,13 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (err)
goto out;
- last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
-
/* now the directory is empty */
err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
&fname.disk_name);
if (!err) {
btrfs_i_size_write(BTRFS_I(inode), 0);
- /*
- * Propagate the last_unlink_trans value of the deleted dir to
- * its parent directory. This is to prevent an unrecoverable
- * log tree in the case we do something like this:
- * 1) create dir foo
- * 2) create snapshot under dir foo
- * 3) delete the snapshot
- * 4) rmdir foo
- * 5) mkdir foo
- * 6) fsync foo or some file inside foo
- */
- if (last_unlink_trans >= trans->transid)
- BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
+ if (BTRFS_I(inode)->last_unlink_trans >= trans->transid)
+ btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
}
out:
btrfs_end_transaction(trans);
@@ -9177,6 +9184,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
int ret;
int ret2;
bool need_abort = false;
+ bool logs_pinned = false;
struct fscrypt_name old_fname, new_fname;
struct fscrypt_str *old_name, *new_name;
@@ -9305,6 +9313,31 @@ static int btrfs_rename_exchange(struct inode *old_dir,
old_inode->i_ctime = ctime;
new_inode->i_ctime = ctime;
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID &&
+ new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * If we are renaming in the same directory (and it's not for
+ * root entries) pin the log early to prevent any concurrent
+ * task from logging the directory after we removed the old
+ * entries and before we add the new entries, otherwise that
+ * task can sync a log without any entry for the inodes we are
+ * renaming and therefore replaying that log, if a power failure
+ * happens after syncing the log, would result in deleting the
+ * inodes.
+ *
+ * If the rename affects two different directories, we want to
+ * make sure that there's no log commit that contains
+ * updates for only one of the directories but not for the
+ * other.
+ *
+ * If we are renaming an entry for a root, we don't care about
+ * log updates since we called btrfs_set_log_full_commit().
+ */
+ btrfs_pin_log_trans(root);
+ btrfs_pin_log_trans(dest);
+ logs_pinned = true;
+ }
+
if (old_dentry->d_parent != new_dentry->d_parent) {
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), 1);
@@ -9362,30 +9395,23 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode)->dir_index = new_idx;
/*
- * Now pin the logs of the roots. We do it to ensure that no other task
- * can sync the logs while we are in progress with the rename, because
- * that could result in an inconsistency in case any of the inodes that
- * are part of this rename operation were logged before.
+ * Do the log updates for all inodes.
+ *
+ * If either entry is for a root we don't need to update the logs since
+ * we've called btrfs_set_log_full_commit() before.
*/
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
- btrfs_pin_log_trans(root);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
- btrfs_pin_log_trans(dest);
-
- /* Do the log updates for all inodes. */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (logs_pinned) {
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
old_rename_ctx.index, new_dentry->d_parent);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
new_rename_ctx.index, old_dentry->d_parent);
+ }
- /* Now unpin the logs. */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+out_fail:
+ if (logs_pinned) {
btrfs_end_log_trans(root);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_end_log_trans(dest);
-out_fail:
+ }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -9435,6 +9461,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
struct fscrypt_name old_fname, new_fname;
+ bool logs_pinned = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -9573,6 +9600,29 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
new_dir->i_ctime = old_dir->i_mtime;
old_inode->i_ctime = old_dir->i_mtime;
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * If we are renaming in the same directory (and it's not a
+ * root entry) pin the log to prevent any concurrent task from
+ * logging the directory after we removed the old entry and
+ * before we add the new entry, otherwise that task can sync
+ * a log without any entry for the inode we are renaming and
+ * therefore replaying that log, if a power failure happens
+ * after syncing the log, would result in deleting the inode.
+ *
+ * If the rename affects two different directories, we want to
+ * make sure that there's no log commit that contains
+ * updates for only one of the directories but not for the
+ * other.
+ *
+ * If we are renaming an entry for a root, we don't care about
+ * log updates since we called btrfs_set_log_full_commit().
+ */
+ btrfs_pin_log_trans(root);
+ btrfs_pin_log_trans(dest);
+ logs_pinned = true;
+ }
+
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), 1);
@@ -9622,7 +9672,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (logs_pinned)
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
rename_ctx.index, new_dentry->d_parent);
@@ -9638,6 +9688,10 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
}
}
out_fail:
+ if (logs_pinned) {
+ btrfs_end_log_trans(root);
+ btrfs_end_log_trans(dest);
+ }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d6cda0b2e925..fd6ea3fcab33 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2977,6 +2977,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
int ret;
ASSERT(page_index <= last_index);
+again:
page = find_lock_page(inode->i_mapping, page_index);
if (!page) {
page_cache_sync_readahead(inode->i_mapping, ra, NULL,
@@ -2998,6 +2999,11 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
ret = -EIO;
goto release_page;
}
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
}
/*
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a2b95ccb4cf5..0735decec99b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -431,10 +431,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
if (p->buf_len >= len)
return 0;
- if (len > PATH_MAX) {
- WARN_ON(1);
- return -ENOMEM;
- }
+ if (WARN_ON(len > PATH_MAX))
+ return -ENAMETOOLONG;
path_len = p->end - p->start;
old_buf_len = p->buf_len;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c0ff0c2fc01d..91b19d66449b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1640,8 +1640,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
subvol_name = btrfs_get_subvol_name_from_objectid(info,
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
if (!IS_ERR(subvol_name)) {
- seq_puts(seq, ",subvol=");
- seq_escape(seq, subvol_name, " \t\n\\");
+ seq_show_option(seq, "subvol", subvol_name);
kfree(subvol_name);
}
return 0;
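seq_show_option() exists for exactly this case: unlike the open-coded seq_escape() it replaces, it also escapes ',' in the value, so a crafted subvolume name cannot inject what looks like an extra mount option into /proc/mounts. Roughly, paraphrasing the helper from include/linux/seq_file.h:

static inline void seq_show_option_sketch(struct seq_file *m,
					  const char *name, const char *value)
{
	seq_putc(m, ',');
	seq_escape(m, name, ",= \t\n\\");
	if (value) {
		seq_putc(m, '=');
		/* Note the leading ',' in the escape set. */
		seq_escape(m, value, ", \t\n\\");
	}
}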
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fdc432b3352a..982dc92bdf1d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1085,7 +1085,9 @@ again:
search_key.type = BTRFS_INODE_REF_KEY;
search_key.offset = parent_objectid;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret == 0) {
+ if (ret < 0) {
+ return ret;
+ } else if (ret == 0) {
struct btrfs_inode_ref *victim_ref;
unsigned long ptr;
unsigned long ptr_end;
@@ -1158,13 +1160,13 @@ again:
struct fscrypt_str victim_name;
extref = (struct btrfs_inode_extref *)(base + cur_offset);
+ victim_name.len = btrfs_inode_extref_name_len(leaf, extref);
if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
goto next;
ret = read_alloc_one_name(leaf, &extref->name,
- btrfs_inode_extref_name_len(leaf, extref),
- &victim_name);
+ victim_name.len, &victim_name);
if (ret)
return ret;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a4177014eb8b..628238493167 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3203,6 +3203,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
device->bytes_used - dev_extent_len);
atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
btrfs_clear_space_info_full(fs_info);
+
+ if (list_empty(&device->post_commit_list)) {
+ list_add_tail(&device->post_commit_list,
+ &trans->transaction->dev_update_list);
+ }
+
mutex_unlock(&fs_info->chunk_mutex);
}
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 794526ab90d2..1dff64e62047 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1909,6 +1909,9 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
device = map->stripes[i].dev;
physical = map->stripes[i].physical;
+ if (!device->bdev)
+ continue;
+
if (device->zone_info->max_active_zones == 0)
continue;
@@ -2052,6 +2055,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
struct btrfs_device *device = map->stripes[i].dev;
const u64 physical = map->stripes[i].physical;
+ if (!device->bdev)
+ continue;
+
if (device->zone_info->max_active_zones == 0)
continue;
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 000a28f46e59..5d2a41bab9c1 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -356,8 +356,6 @@ int __cachefiles_write(struct cachefiles_object *object,
default:
ki->was_async = false;
cachefiles_write_complete(&ki->iocb, ret);
- if (ret > 0)
- ret = 0;
break;
}
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index 3389a373faf6..cfa8f23fdfb6 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -84,10 +84,8 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb,
trace_cachefiles_ondemand_fd_write(object, file_inode(file), pos, len);
ret = __cachefiles_write(object, file, pos, iter, NULL, NULL);
- if (!ret) {
- ret = len;
+ if (ret > 0)
kiocb->ki_pos += ret;
- }
out:
fput(file);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 882eccfd67e8..3336647e64df 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -2043,7 +2043,7 @@ static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
s32 stripe_unit = ci->i_layout.stripe_unit;
s32 stripe_count = ci->i_layout.stripe_count;
s32 object_size = ci->i_layout.object_size;
- u64 object_set_size = object_size * stripe_count;
+ u64 object_set_size = (u64) object_size * stripe_count;
u64 nearly, t;
/* round offset up to next period boundary */
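The cast matters because all three layout fields are s32: without it the multiplication is performed in 32-bit arithmetic, and only the truncated product is widened to u64. A standalone sketch of the two evaluation orders, with made-up layout values; the 32-bit product is computed via uint32_t so the wraparound stays well defined:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical ceph file layout: 4 MiB objects, 1024-wide striping. */
	int32_t object_size = 4 * 1024 * 1024;
	int32_t stripe_count = 1024;

	/* 32-bit multiply first, widened afterwards: high bits already lost. */
	uint64_t truncated = (uint32_t)object_size * (uint32_t)stripe_count;

	/* Widen one operand first, as the patched line does with "(u64)". */
	uint64_t full = (uint64_t)object_size * stripe_count;

	printf("truncated=%llu full=%llu\n",
	       (unsigned long long)truncated, (unsigned long long)full);
	return 0;
}

For these values this prints truncated=0 and full=4294967296.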
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index bd15991166c2..0f56e4ba7812 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1136,6 +1136,7 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc)
s->s_time_min = 0;
s->s_time_max = U32_MAX;
s->s_flags |= SB_NODIRATIME | SB_NOATIME;
+ s->s_magic = CEPH_SUPER_MAGIC;
ret = set_anon_super_fc(s, fc);
if (ret != 0)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index ec6519e1ca3b..e017ba188f7b 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -593,7 +593,7 @@ static int populate_attrs(struct config_item *item)
break;
}
}
- if (t->ct_bin_attrs) {
+ if (!error && t->ct_bin_attrs) {
for (i = 0; (bin_attr = t->ct_bin_attrs[i]) != NULL; i++) {
error = configfs_create_bin_file(item, bin_attr);
if (error)
diff --git a/fs/coredump.c b/fs/coredump.c
index 4d332f147137..b0d782367fcc 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -42,6 +42,7 @@
#include <linux/timekeeping.h>
#include <linux/sysctl.h>
#include <linux/elf.h>
+#include <uapi/linux/pidfd.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -56,6 +57,13 @@
static bool dump_vma_snapshot(struct coredump_params *cprm);
static void free_vma_snapshot(struct coredump_params *cprm);
+/*
+ * File descriptor number for the pidfd for the thread-group leader of
+ * the coredumping task installed into the usermode helper's file
+ * descriptor table.
+ */
+#define COREDUMP_PIDFD_NUMBER 3
+
static int core_uses_pid;
static unsigned int core_pipe_limit;
static char core_pattern[CORENAME_MAX_SIZE] = "core";
@@ -325,6 +333,27 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
err = cn_printf(cn, "%lu",
rlimit(RLIMIT_CORE));
break;
+ /* pidfd number */
+ case 'F': {
+ /*
+ * Installing a pidfd only makes sense if
+ * we actually spawn a usermode helper.
+ */
+ if (!ispipe)
+ break;
+
+ /*
+ * Note that we'll install a pidfd for the
+ * thread-group leader. We know that task
+ * linkage hasn't been removed yet, and even if
+ * @current isn't the actual thread-group leader,
+ * the thread-group leader cannot be reaped until
+ * @current has exited.
+ */
+ cprm->pid = task_tgid(current);
+ err = cn_printf(cn, "%d", COREDUMP_PIDFD_NUMBER);
+ break;
+ }
default:
break;
}
@@ -479,7 +508,7 @@ static void wait_for_dump_helpers(struct file *file)
}
/*
- * umh_pipe_setup
+ * umh_coredump_setup
* helper function to customize the process used
* to collect the core in userspace. Specifically
* it sets up a pipe and installs it as fd 0 (stdin)
@@ -489,21 +518,61 @@ static void wait_for_dump_helpers(struct file *file)
* is a special value that we use to trap recursive
* core dumps
*/
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+static int umh_coredump_setup(struct subprocess_info *info, struct cred *new)
{
struct file *files[2];
+ struct file *pidfs_file = NULL;
struct coredump_params *cp = (struct coredump_params *)info->data;
- int err = create_pipe_files(files, 0);
+ int err;
+
+ if (cp->pid) {
+ int fd;
+
+ fd = pidfd_prepare(cp->pid, 0, &pidfs_file);
+ if (fd < 0)
+ return fd;
+
+ /*
+ * We don't care about the fd. We also cannot simply
+ * replace it below because dup2() will refuse to close
+ * this file descriptor if it's in a larval state. So
+ * close it!
+ */
+ put_unused_fd(fd);
+
+ /*
+ * Usermode helpers are children of either
+ * system_unbound_wq or kthreadd. So we know that
+ * we're starting off with a clean file descriptor
+ * table. So we should always be able to use
+ * COREDUMP_PIDFD_NUMBER as our file descriptor value.
+ */
+ err = replace_fd(COREDUMP_PIDFD_NUMBER, pidfs_file, 0);
+ if (err < 0)
+ goto out_fail;
+
+ pidfs_file = NULL;
+ }
+
+ err = create_pipe_files(files, 0);
if (err)
- return err;
+ goto out_fail;
cp->file = files[1];
err = replace_fd(0, files[0], 0);
fput(files[0]);
+ if (err < 0)
+ goto out_fail;
+
/* and disallow core files too */
current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
+ err = 0;
+
+out_fail:
+ if (pidfs_file)
+ fput(pidfs_file);
return err;
}
@@ -580,7 +649,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
}
if (cprm.limit == 1) {
- /* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
+ /* See umh_coredump_setup() which sets RLIMIT_CORE = 1.
*
* Normally core limits are irrelevant to pipes, since
* we're not writing to the file system, but we use
@@ -625,7 +694,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
retval = -ENOMEM;
sub_info = call_usermodehelper_setup(helper_argv[0],
helper_argv, NULL, GFP_KERNEL,
- umh_pipe_setup, NULL, &cprm);
+ umh_coredump_setup, NULL, &cprm);
if (sub_info)
retval = call_usermodehelper_exec(sub_info,
UMH_WAIT_EXEC);
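Tying the pieces together: '%F' only expands for pipe helpers, and the spawned helper then finds a pidfd for the crashing task's thread-group leader at fd 3 alongside the core pipe on stdin. A hedged sketch of such a helper; the binary path and registration line are hypothetical:

/*
 * Registered, for example, via:
 *   echo '|/usr/local/bin/core-catcher %F' > /proc/sys/kernel/core_pattern
 */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int pidfd = 3; /* COREDUMP_PIDFD_NUMBER */
	char buf[65536];
	ssize_t n;

	/* Drain the core image from stdin; a real helper would persist it. */
	while ((n = read(STDIN_FILENO, buf, sizeof(buf))) > 0)
		;

	/*
	 * The pidfd refers to the thread-group leader, which cannot be
	 * reaped while the dump is in flight; it could be passed to
	 * waitid(P_PIDFD, ...) or pidfd_send_signal() for bookkeeping.
	 */
	close(pidfd);
	return n < 0 ? 1 : 0;
}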
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 2c797eb519da..51a641822d6c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1863,8 +1863,8 @@ static int dlm_tcp_listen_validate(void)
{
/* We don't support multi-homed hosts */
if (dlm_local_count > 1) {
- log_print("TCP protocol can't handle multi-homed hosts, try SCTP");
- return -EINVAL;
+ log_print("Detect multi-homed hosts but use only the first IP address.");
+ log_print("Try SCTP, if you want to enable multi-link.");
}
return 0;
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 5c2e6fbb70a3..7b648bec61fd 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -354,6 +354,8 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
static int erofs_read_folio(struct file *file, struct folio *folio)
{
+ trace_erofs_read_folio(folio, true);
+
return iomap_read_folio(folio, &erofs_iomap_ops);
}
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 0eaa9e495346..e524c0b432f3 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -323,7 +323,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
const unsigned int lefthalf = rq->outputsize - righthalf;
const unsigned int interlaced_offset =
rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out;
- unsigned char *src, *dst;
+ u8 *src;
if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) {
DBG_BUGON(1);
@@ -336,23 +336,18 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
}
src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in;
- if (rq->out[0]) {
- dst = kmap_local_page(rq->out[0]);
- memcpy(dst + rq->pageofs_out, src + interlaced_offset,
- righthalf);
- kunmap_local(dst);
- }
+ if (rq->out[0])
+ memcpy_to_page(rq->out[0], rq->pageofs_out,
+ src + interlaced_offset, righthalf);
if (outpages > inpages) {
DBG_BUGON(!rq->out[outpages - 1]);
- if (rq->out[outpages - 1] != rq->in[inpages - 1]) {
- dst = kmap_local_page(rq->out[outpages - 1]);
- memcpy(dst, interlaced_offset ? src :
- (src + righthalf), lefthalf);
- kunmap_local(dst);
- } else if (!interlaced_offset) {
+ if (rq->out[outpages - 1] != rq->in[inpages - 1])
+ memcpy_to_page(rq->out[outpages - 1], 0, src +
+ (interlaced_offset ? 0 : righthalf),
+ lefthalf);
+ else if (!interlaced_offset)
memmove(src, src + righthalf, lefthalf);
- }
}
kunmap_local(src);
return 0;
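
For reference, memcpy_to_page() folds the removed kmap/memcpy/kunmap sequence into a single helper, roughly as below (per its definition in include/linux/highmem.h; the bounds check is elided here), and also adds the flush_dcache_page() call the open-coded copies lacked:

static inline void memcpy_to_page(struct page *page, size_t offset,
				  const char *from, size_t len)
{
	char *to = kmap_local_page(page);

	memcpy(to + offset, from, len);
	flush_dcache_page(page);
	kunmap_local(to);
}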
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index 966a88cc529e..963bbed0b699 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -6,21 +6,6 @@
*/
#include "internal.h"
-static void debug_one_dentry(unsigned char d_type, const char *de_name,
- unsigned int de_namelen)
-{
-#ifdef CONFIG_EROFS_FS_DEBUG
- /* since the on-disk name could not have the trailing '\0' */
- unsigned char dbg_namebuf[EROFS_NAME_LEN + 1];
-
- memcpy(dbg_namebuf, de_name, de_namelen);
- dbg_namebuf[de_namelen] = '\0';
-
- erofs_dbg("found dirent %s de_len %u d_type %d", dbg_namebuf,
- de_namelen, d_type);
-#endif
-}
-
static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
void *dentry_blk, struct erofs_dirent *de,
unsigned int nameoff, unsigned int maxsize)
@@ -52,10 +37,8 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
return -EFSCORRUPTED;
}
- debug_one_dentry(d_type, de_name, de_namelen);
if (!dir_emit(ctx, de_name, de_namelen,
le64_to_cpu(de->nid), d_type))
- /* stopped by some reason */
return 1;
++de;
ctx->pos += sizeof(struct erofs_dirent);
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 7dcf350b9fef..3cbef6318b7b 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -26,9 +26,6 @@ static void *erofs_read_inode(struct erofs_buf *buf,
blkaddr = erofs_blknr(sb, inode_loc);
*ofs = erofs_blkoff(sb, inode_loc);
- erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u",
- __func__, vi->nid, *ofs, blkaddr);
-
kaddr = erofs_read_metabuf(buf, sb, blkaddr, EROFS_KMAP);
if (IS_ERR(kaddr)) {
erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld",
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index d7cd1e619d46..126970932805 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -32,10 +32,8 @@ __printf(3, 4) void _erofs_info(struct super_block *sb,
#define erofs_info(sb, fmt, ...) \
_erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__)
#ifdef CONFIG_EROFS_FS_DEBUG
-#define erofs_dbg(x, ...) pr_debug(x "\n", ##__VA_ARGS__)
#define DBG_BUGON BUG_ON
#else
-#define erofs_dbg(x, ...) ((void)0)
#define DBG_BUGON(x) ((void)(x))
#endif /* !CONFIG_EROFS_FS_DEBUG */
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index 8332428b780c..c0d5ffb62420 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -203,16 +203,13 @@ static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry,
err = erofs_namei(dir, &dentry->d_name, &nid, &d_type);
- if (err == -ENOENT) {
+ if (err == -ENOENT)
/* negative dentry */
inode = NULL;
- } else if (err) {
+ else if (err)
inode = ERR_PTR(err);
- } else {
- erofs_dbg("%s, %pd (nid %llu) found, d_type %u", __func__,
- dentry, nid, d_type);
+ else
inode = erofs_iget(dir->i_sb, nid);
- }
return d_splice_alias(inode, dentry);
}
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 94e9e0bf3bbd..5e6580217318 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -5,7 +5,6 @@
* Copyright (C) 2022 Alibaba Cloud
*/
#include "compress.h"
-#include <linux/prefetch.h>
#include <linux/psi.h>
#include <trace/events/erofs.h>
@@ -124,9 +123,11 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
/*
* bit 30: I/O error occurred on this page
+ * bit 29: CPU has dirty data in D-cache (needs aliasing handling);
* bit 0 - 28: remaining parts to complete this page
*/
-#define Z_EROFS_PAGE_EIO (1 << 30)
+#define Z_EROFS_ONLINEPAGE_EIO 30
+#define Z_EROFS_ONLINEPAGE_DIRTY 29
static inline void z_erofs_onlinepage_init(struct page *page)
{
@@ -145,29 +146,28 @@ static inline void z_erofs_onlinepage_split(struct page *page)
atomic_inc((atomic_t *)&page->private);
}
-static inline void z_erofs_page_mark_eio(struct page *page)
+static void z_erofs_onlinepage_end(struct page *page, int err, bool dirty)
{
- int orig;
+ int orig, v;
+
+ DBG_BUGON(!PagePrivate(page));
do {
orig = atomic_read((atomic_t *)&page->private);
- } while (atomic_cmpxchg((atomic_t *)&page->private, orig,
- orig | Z_EROFS_PAGE_EIO) != orig);
-}
-
-static inline void z_erofs_onlinepage_endio(struct page *page)
-{
- unsigned int v;
+ DBG_BUGON(orig <= 0);
+ v = dirty << Z_EROFS_ONLINEPAGE_DIRTY;
+ v |= (orig - 1) | (!!err << Z_EROFS_ONLINEPAGE_EIO);
+ } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig);
- DBG_BUGON(!PagePrivate(page));
- v = atomic_dec_return((atomic_t *)&page->private);
- if (!(v & ~Z_EROFS_PAGE_EIO)) {
- set_page_private(page, 0);
- ClearPagePrivate(page);
- if (!(v & Z_EROFS_PAGE_EIO))
- SetPageUptodate(page);
- unlock_page(page);
- }
+ if (v & (BIT(Z_EROFS_ONLINEPAGE_DIRTY) - 1))
+ return;
+ set_page_private(page, 0);
+ ClearPagePrivate(page);
+ if (v & BIT(Z_EROFS_ONLINEPAGE_DIRTY))
+ flush_dcache_page(page);
+ if (!(v & BIT(Z_EROFS_ONLINEPAGE_EIO)))
+ SetPageUptodate(page);
+ unlock_page(page);
}
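
The packed-counter protocol above is easier to follow in isolation. A userspace model, a sketch only: C11 atomics stand in for the kernel's atomic_t, and printf for the real page-state updates.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define ONLINEPAGE_EIO   30
#define ONLINEPAGE_DIRTY 29

static _Atomic unsigned int priv = 1;	/* init: one pending part */

static void onlinepage_split(void)
{
	atomic_fetch_add(&priv, 1);	/* one more part pending */
}

static void onlinepage_end(int err, bool dirty)
{
	unsigned int orig, v;

	do {
		orig = atomic_load(&priv);
		v = (unsigned int)dirty << ONLINEPAGE_DIRTY;
		v |= (orig - 1) | ((unsigned int)!!err << ONLINEPAGE_EIO);
	} while (!atomic_compare_exchange_weak(&priv, &orig, v));

	if (v & ((1u << ONLINEPAGE_DIRTY) - 1))
		return;			/* parts still pending in bits 0-28 */
	if (v & (1u << ONLINEPAGE_DIRTY))
		printf("flush D-cache\n");	/* stand-in for flush_dcache_page() */
	printf((v & (1u << ONLINEPAGE_EIO)) ? "EIO\n" : "uptodate\n");
}

int main(void)
{
	onlinepage_split();		/* the page is split into two parts */
	onlinepage_end(0, true);	/* part 1 done; CPU dirtied the data */
	onlinepage_end(0, false);	/* part 2 done: finalize the page */
	return 0;
}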
#define Z_EROFS_ONSTACK_PAGES 32
@@ -237,14 +237,20 @@ static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
struct z_erofs_bvec *bvec,
- struct page **candidate_bvpage)
+ struct page **candidate_bvpage,
+ struct page **pagepool)
{
- if (iter->cur == iter->nr) {
- if (!*candidate_bvpage)
- return -EAGAIN;
-
+ if (iter->cur >= iter->nr) {
+ struct page *nextpage = *candidate_bvpage;
+
+ if (!nextpage) {
+ nextpage = erofs_allocpage(pagepool, GFP_NOFS);
+ if (!nextpage)
+ return -ENOMEM;
+ set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
+ }
DBG_BUGON(iter->bvset->nextpage);
- iter->bvset->nextpage = *candidate_bvpage;
+ iter->bvset->nextpage = nextpage;
z_erofs_bvset_flip(iter);
iter->bvset->nextpage = NULL;
@@ -402,12 +408,12 @@ struct z_erofs_decompress_frontend {
struct erofs_map_blocks map;
struct z_erofs_bvec_iter biter;
+ struct page *pagepool;
struct page *candidate_bvpage;
struct z_erofs_pcluster *pcl;
z_erofs_next_pcluster_t owned_head;
enum z_erofs_pclustermode mode;
- bool readahead;
/* used for applying cache strategy on the fly */
bool backmost;
erofs_off_t headoffset;
@@ -437,8 +443,7 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
return false;
}
-static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
- struct page **pagepool)
+static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
{
struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
struct z_erofs_pcluster *pcl = fe->pcl;
@@ -479,7 +484,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
* succeeds or fallback to in-place I/O instead
* to avoid any direct reclaim.
*/
- newpage = erofs_allocpage(pagepool, gfp);
+ newpage = erofs_allocpage(&fe->pagepool, gfp);
if (!newpage)
continue;
set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
@@ -492,7 +497,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
if (page)
put_page(page);
else if (newpage)
- erofs_pagepool_add(pagepool, newpage);
+ erofs_pagepool_add(&fe->pagepool, newpage);
}
/*
@@ -590,7 +595,8 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
!fe->candidate_bvpage)
fe->candidate_bvpage = bvec->page;
}
- ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage);
+ ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage,
+ &fe->pagepool);
fe->pcl->vcnt += (ret >= 0);
return ret;
}
@@ -746,10 +752,8 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
z_erofs_bvec_iter_end(&fe->biter);
mutex_unlock(&pcl->lock);
- if (fe->candidate_bvpage) {
- DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage));
+ if (fe->candidate_bvpage)
fe->candidate_bvpage = NULL;
- }
/*
* if all pending pages are added, don't hold its reference
@@ -796,7 +800,7 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos,
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page, struct page **pagepool)
+ struct page *page)
{
struct inode *const inode = fe->inode;
struct erofs_map_blocks *const map = &fe->map;
@@ -815,8 +819,6 @@ repeat:
if (offset + cur < map->m_la ||
offset + cur >= map->m_la + map->m_llen) {
- erofs_dbg("out-of-range map @ pos %llu", offset + cur);
-
if (z_erofs_collector_end(fe))
fe->backmost = false;
map->m_la = offset + cur;
@@ -857,7 +859,7 @@ repeat:
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
} else {
/* bind cache first when cached decompression is preferred */
- z_erofs_bind_cache(fe, pagepool);
+ z_erofs_bind_cache(fe);
}
hitted:
/*
@@ -898,24 +900,13 @@ hitted:
if (cur)
tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
-retry:
err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
.page = page,
.offset = offset - map->m_la,
.end = end,
}), exclusive);
- /* should allocate an additional short-lived page for bvset */
- if (err == -EAGAIN && !fe->candidate_bvpage) {
- fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL);
- set_page_private(fe->candidate_bvpage,
- Z_EROFS_SHORTLIVED_PAGE);
- goto retry;
- }
-
- if (err) {
- DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage);
+ if (err)
goto out;
- }
z_erofs_onlinepage_split(page);
/* bump up the number of split parts of a page */
@@ -940,16 +931,11 @@ next_part:
goto repeat;
out:
- if (err)
- z_erofs_page_mark_eio(page);
- z_erofs_onlinepage_endio(page);
-
- erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu",
- __func__, page, spiltted, map->m_llen);
+ z_erofs_onlinepage_end(page, err, false);
return err;
}
-static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi,
+static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi,
unsigned int readahead_pages)
{
/* auto: enable for read_folio, disable for readahead */
@@ -1048,9 +1034,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
cur += len;
}
kunmap_local(dst);
- if (err)
- z_erofs_page_mark_eio(bvi->bvec.page);
- z_erofs_onlinepage_endio(bvi->bvec.page);
+ z_erofs_onlinepage_end(bvi->bvec.page, err, true);
list_del(p);
kfree(bvi);
}
@@ -1218,9 +1202,7 @@ out:
/* recycle all individual short-lived pages */
if (z_erofs_put_shortlivedpage(be->pagepool, page))
continue;
- if (err)
- z_erofs_page_mark_eio(page);
- z_erofs_onlinepage_endio(page);
+ z_erofs_onlinepage_end(page, err, true);
}
if (be->decompressed_pages != be->onstack_pages)
@@ -1480,9 +1462,8 @@ static void z_erofs_decompressqueue_endio(struct bio *bio)
}
static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
- struct page **pagepool,
struct z_erofs_decompressqueue *fgq,
- bool *force_fg)
+ bool *force_fg, bool readahead)
{
struct super_block *sb = f->inode->i_sb;
struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
@@ -1538,8 +1519,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
do {
struct page *page;
- page = pickup_page_for_submission(pcl, i++, pagepool,
- mc);
+ page = pickup_page_for_submission(pcl, i++,
+ &f->pagepool, mc);
if (!page)
continue;
@@ -1568,7 +1549,7 @@ submit_bio_retry:
bio->bi_iter.bi_sector = (sector_t)cur <<
(sb->s_blocksize_bits - 9);
bio->bi_private = q[JQ_SUBMIT];
- if (f->readahead)
+ if (readahead)
bio->bi_opf |= REQ_RAHEAD;
++nr_bios;
}
@@ -1604,16 +1585,16 @@ submit_bio_retry:
}
static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
- struct page **pagepool, bool force_fg)
+ bool force_fg, bool ra)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
return;
- z_erofs_submit_queue(f, pagepool, io, &force_fg);
+ z_erofs_submit_queue(f, io, &force_fg, ra);
/* handle bypass queue (no i/o pclusters) immediately */
- z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);
+ z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool);
if (!force_fg)
return;
@@ -1622,7 +1603,7 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
wait_for_completion_io(&io[JQ_SUBMIT].u.done);
/* handle synchronous decompress queue in the caller context */
- z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
+ z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool);
}
/*
@@ -1630,29 +1611,28 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
* approximate readmore strategies as a start.
*/
static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
- struct readahead_control *rac,
- erofs_off_t end,
- struct page **pagepool,
- bool backmost)
+ struct readahead_control *rac, bool backmost)
{
struct inode *inode = f->inode;
struct erofs_map_blocks *map = &f->map;
- erofs_off_t cur;
+ erofs_off_t cur, end, headoffset = f->headoffset;
int err;
if (backmost) {
+ if (rac)
+ end = headoffset + readahead_length(rac) - 1;
+ else
+ end = headoffset + PAGE_SIZE - 1;
map->m_la = end;
err = z_erofs_map_blocks_iter(inode, map,
EROFS_GET_BLOCKS_READMORE);
if (err)
return;
- /* expend ra for the trailing edge if readahead */
+ /* expand ra for the trailing edge if readahead */
if (rac) {
- loff_t newstart = readahead_pos(rac);
-
cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
- readahead_expand(rac, newstart, cur - newstart);
+ readahead_expand(rac, headoffset, cur - headoffset);
return;
}
end = round_up(end, PAGE_SIZE);
@@ -1673,7 +1653,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
if (PageUptodate(page)) {
unlock_page(page);
} else {
- err = z_erofs_do_read_page(f, page, pagepool);
+ err = z_erofs_do_read_page(f, page);
if (err)
erofs_err(inode->i_sb,
"readmore error at page %lu @ nid %llu",
@@ -1690,32 +1670,27 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *const inode = page->mapping->host;
+ struct inode *const inode = folio->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- struct page *pagepool = NULL;
int err;
- trace_erofs_readpage(page, false);
- f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
-
- z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1,
- &pagepool, true);
- err = z_erofs_do_read_page(&f, page, &pagepool);
- z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false);
+ trace_erofs_read_folio(folio, false);
+ f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
+ z_erofs_pcluster_readmore(&f, NULL, true);
+ err = z_erofs_do_read_page(&f, &folio->page);
+ z_erofs_pcluster_readmore(&f, NULL, false);
(void)z_erofs_collector_end(&f);
/* if some compressed cluster ready, need submit them anyway */
- z_erofs_runqueue(&f, &pagepool,
- z_erofs_get_sync_decompress_policy(sbi, 0));
+ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false);
if (err)
erofs_err(inode->i_sb, "failed to read, err [%d]", err);
erofs_put_metabuf(&f.map.buf);
- erofs_release_pages(&pagepool);
+ erofs_release_pages(&f.pagepool);
return err;
}
@@ -1724,14 +1699,12 @@ static void z_erofs_readahead(struct readahead_control *rac)
struct inode *const inode = rac->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- struct page *pagepool = NULL, *head = NULL, *page;
+ struct page *head = NULL, *page;
unsigned int nr_pages;
- f.readahead = true;
f.headoffset = readahead_pos(rac);
- z_erofs_pcluster_readmore(&f, rac, f.headoffset +
- readahead_length(rac) - 1, &pagepool, true);
+ z_erofs_pcluster_readmore(&f, rac, true);
nr_pages = readahead_count(rac);
trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
@@ -1747,20 +1720,19 @@ static void z_erofs_readahead(struct readahead_control *rac)
/* traversal in reverse order */
head = (void *)page_private(page);
- err = z_erofs_do_read_page(&f, page, &pagepool);
+ err = z_erofs_do_read_page(&f, page);
if (err)
erofs_err(inode->i_sb,
"readahead error at page %lu @ nid %llu",
page->index, EROFS_I(inode)->nid);
put_page(page);
}
- z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
+ z_erofs_pcluster_readmore(&f, rac, false);
(void)z_erofs_collector_end(&f);
- z_erofs_runqueue(&f, &pagepool,
- z_erofs_get_sync_decompress_policy(sbi, nr_pages));
+ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true);
erofs_put_metabuf(&f.map.buf);
- erofs_release_pages(&pagepool);
+ erofs_release_pages(&f.pagepool);
}
const struct address_space_operations z_erofs_aops = {
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 2cd70cf4c8b2..d2d7fe826091 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -603,9 +603,6 @@ static int z_erofs_do_map_blocks(struct inode *inode,
unmap_out:
erofs_unmap_metabuf(&m.map->buf);
- erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o",
- __func__, map->m_la, map->m_pa,
- map->m_llen, map->m_plen, map->m_flags);
return err;
}
diff --git a/fs/exec.c b/fs/exec.c
index 2039414cc662..b65af8f9a4f9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -169,7 +169,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
exit:
fput(file);
out:
- return error;
+ return error;
}
#endif /* #ifdef CONFIG_USELIB */
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index 5b547a596380..32209acd51be 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -160,7 +160,7 @@ int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync)
return 0;
}
-void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync)
+int exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync)
{
int i, b;
unsigned int ent_idx;
@@ -169,13 +169,17 @@ void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync)
struct exfat_mount_options *opts = &sbi->options;
if (!is_valid_cluster(sbi, clu))
- return;
+ return -EIO;
ent_idx = CLUSTER_TO_BITMAP_ENT(clu);
i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
+ if (!test_bit_le(b, sbi->vol_amap[i]->b_data))
+ return -EIO;
+
clear_bit_le(b, sbi->vol_amap[i]->b_data);
+
exfat_update_bh(sbi->vol_amap[i], sync);
if (opts->discard) {
@@ -190,6 +194,8 @@ void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync)
opts->discard = 0;
}
}
+
+ return 0;
}
/*
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index c79c78bf265b..5a1251207ab2 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -419,7 +419,7 @@ int exfat_count_num_clusters(struct super_block *sb,
int exfat_load_bitmap(struct super_block *sb);
void exfat_free_bitmap(struct exfat_sb_info *sbi);
int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync);
-void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync);
+int exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync);
unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu);
int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count);
int exfat_trim_fs(struct inode *inode, struct fstrim_range *range);
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index fe007ae2f23c..9fa4cffabfb6 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -175,6 +175,7 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain
BITMAP_OFFSET_SECTOR_INDEX(sb, CLUSTER_TO_BITMAP_ENT(clu));
if (p_chain->flags == ALLOC_NO_FAT_CHAIN) {
+ int err;
unsigned int last_cluster = p_chain->dir + p_chain->size - 1;
do {
bool sync = false;
@@ -189,7 +190,9 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain
cur_cmap_i = next_cmap_i;
}
- exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode)));
+ err = exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode)));
+ if (err)
+ break;
clu++;
num_clusters++;
} while (num_clusters < p_chain->size);
@@ -210,12 +213,13 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain
cur_cmap_i = next_cmap_i;
}
- exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode)));
+ if (exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode))))
+ break;
clu = n_clu;
num_clusters++;
if (err)
- goto dec_used_clus;
+ break;
if (num_clusters >= sbi->num_clusters - EXFAT_FIRST_CLUSTER) {
/*
@@ -229,7 +233,6 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain
} while (clu != EXFAT_EOF_CLUSTER);
}
-dec_used_clus:
sbi->used_clusters -= num_clusters;
return 0;
}
@@ -262,7 +265,7 @@ int exfat_find_last_cluster(struct super_block *sb, struct exfat_chain *p_chain,
clu = next;
if (exfat_ent_get(sb, clu, &next))
return -EIO;
- } while (next != EXFAT_EOF_CLUSTER);
+ } while (next != EXFAT_EOF_CLUSTER && count <= p_chain->size);
if (p_chain->size != count) {
exfat_fs_error(sb,
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index fbd0329cf254..9efe97f3721b 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -638,8 +638,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
/* Hm, nope. Are (enough) root reserved clusters available? */
if (uid_eq(sbi->s_resuid, current_fsuid()) ||
(!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
- capable(CAP_SYS_RESOURCE) ||
- (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+ (flags & EXT4_MB_USE_ROOT_BLOCKS) ||
+ capable(CAP_SYS_RESOURCE)) {
if (free_clusters >= (nclusters + dirty_clusters +
resv_clusters))
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 6fe3c941b565..4d6ba140276b 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -351,10 +351,9 @@ int ext4_check_blockref(const char *function, unsigned int line,
{
__le32 *bref = p;
unsigned int blk;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ if (journal && inode == journal->j_inode)
return 0;
while (bref < p+max) {
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 7ea33c3fe94e..6682b8ab11f1 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -104,6 +104,9 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
else if (unlikely(le32_to_cpu(de->inode) >
le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
+ else if (unlikely(next_offset == size && de->name_len == 1 &&
+ de->name[0] == '.'))
+ error_msg = "'.' directory cannot be the last in data block";
else
return 0;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index faa889882e55..903bb01e6dd2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3354,6 +3354,13 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
return 1 << sbi->s_log_groups_per_flex;
}
+static inline loff_t ext4_get_maxbytes(struct inode *inode)
+{
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return inode->i_sb->s_maxbytes;
+ return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+}
+
#define ext4_std_error(sb, errno) \
do { \
if ((errno)) \
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 53fd2431062c..1aad4ae0e7ae 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2374,18 +2374,19 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
{
int index;
- int depth;
/* If we are converting the inline data, only one is needed here. */
if (ext4_has_inline_data(inode))
return 1;
- depth = ext_depth(inode);
-
+ /*
+ * Extent tree can change between the time we estimate credits and
+ * the time we actually modify the tree. Assume the worst case.
+ */
if (extents <= 1)
- index = depth * 2;
+ index = EXT4_MAX_EXTENT_DEPTH * 2;
else
- index = depth * 3;
+ index = EXT4_MAX_EXTENT_DEPTH * 3;
return index;
}
@@ -4973,12 +4974,7 @@ static const struct iomap_ops ext4_iomap_xattr_ops = {
static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len)
{
- u64 maxbytes;
-
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- maxbytes = inode->i_sb->s_maxbytes;
- else
- maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+ u64 maxbytes = ext4_get_maxbytes(inode);
if (*len == 0)
return -EINVAL;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 202101c2cb05..289b088f4ae5 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -880,12 +880,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
- loff_t maxbytes;
-
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
- else
- maxbytes = inode->i_sb->s_maxbytes;
+ loff_t maxbytes = ext4_get_maxbytes(inode);
switch (whence) {
default:
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 7c9efc9330fe..312be3d7cfb3 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -393,7 +393,7 @@ out:
}
static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
- unsigned int len)
+ loff_t len)
{
int ret, size, no_expand;
struct ext4_inode_info *ei = EXT4_I(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 694af768ac5b..055216f9f464 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -406,10 +406,11 @@ static int __check_block_validity(struct inode *inode, const char *func,
unsigned int line,
struct ext4_map_blocks *map)
{
- if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+
+ if (journal && inode == journal->j_inode)
return 0;
+
if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
ext4_error_inode(inode, func, line, map->m_pblk,
"lblock %lu mapped to illegal pblock %llu "
@@ -4776,22 +4777,43 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
inode_set_iversion_queried(inode, val);
}
-static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
-
+static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
+ const char *function, unsigned int line)
{
+ const char *err_str;
+
if (flags & EXT4_IGET_EA_INODE) {
- if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
- return "missing EA_INODE flag";
+ if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ err_str = "missing EA_INODE flag";
+ goto error;
+ }
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
- EXT4_I(inode)->i_file_acl)
- return "ea_inode with extended attributes";
+ EXT4_I(inode)->i_file_acl) {
+ err_str = "ea_inode with extended attributes";
+ goto error;
+ }
} else {
- if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
- return "unexpected EA_INODE flag";
+ if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ /*
+ * open_by_handle_at() could provide an old inode number
+ * that has since been reused for an ea_inode; this does
+ * not indicate filesystem corruption
+ */
+ if (flags & EXT4_IGET_HANDLE)
+ return -ESTALE;
+ err_str = "unexpected EA_INODE flag";
+ goto error;
+ }
+ }
+ if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) {
+ err_str = "unexpected bad inode w/o EXT4_IGET_BAD";
+ goto error;
}
- if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD))
- return "unexpected bad inode w/o EXT4_IGET_BAD";
- return NULL;
+ return 0;
+
+error:
+ ext4_error_inode(inode, function, line, 0, err_str);
+ return -EFSCORRUPTED;
}
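
From userspace, the new -ESTALE path surfaces as a clean open_by_handle_at() failure rather than a corruption report. A sketch, with error handling trimmed:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>

/* Reopen a previously saved handle; the inode number may have been
 * reused (e.g. for an ea_inode) since name_to_handle_at() ran. */
int open_saved_handle(int mount_fd, struct file_handle *fh)
{
	int fd = open_by_handle_at(mount_fd, fh, O_RDONLY);

	if (fd < 0 && errno == ESTALE)
		fprintf(stderr, "handle no longer names a usable inode\n");
	return fd;
}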
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
@@ -4803,7 +4825,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
struct ext4_inode_info *ei;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct inode *inode;
- const char *err_str;
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
loff_t size;
@@ -4832,10 +4853,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (!inode)
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW)) {
- if ((err_str = check_igot_inode(inode, flags)) != NULL) {
- ext4_error_inode(inode, function, line, 0, err_str);
+ ret = check_igot_inode(inode, flags, function, line);
+ if (ret) {
iput(inode);
- return ERR_PTR(-EFSCORRUPTED);
+ return ERR_PTR(ret);
}
return inode;
}
@@ -4947,7 +4968,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(sb, raw_inode);
- if ((size = i_size_read(inode)) < 0) {
+ size = i_size_read(inode);
+ if (size < 0 || size > ext4_get_maxbytes(inode)) {
ext4_error_inode(inode, function, line, 0,
"iget: bad i_size value: %lld", size);
ret = -EFSCORRUPTED;
@@ -5107,13 +5129,21 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = -EFSCORRUPTED;
goto bad_inode;
}
- if ((err_str = check_igot_inode(inode, flags)) != NULL) {
- ext4_error_inode(inode, function, line, 0, err_str);
- ret = -EFSCORRUPTED;
- goto bad_inode;
+ ret = check_igot_inode(inode, flags, function, line);
+ /*
+ * -ESTALE here means there is nothing inherently wrong with the inode,
+ * it's just not an inode we can return for an fhandle lookup.
+ */
+ if (ret == -ESTALE) {
+ brelse(iloc.bh);
+ unlock_new_inode(inode);
+ iput(inode);
+ return ERR_PTR(-ESTALE);
}
-
+ if (ret)
+ goto bad_inode;
brelse(iloc.bh);
+
unlock_new_inode(inode);
return inode;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index b6a8b6c851cc..b8683c5acffe 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2041,7 +2041,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
* split it in half by count; each resulting block will have at least
* half the space free.
*/
- if (i > 0)
+ if (i >= 0)
split = count - move;
else
split = count/2;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 53f1deb049ec..f829f989f2b5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2741,6 +2741,13 @@ static int ext4_check_opt_consistency(struct fs_context *fc,
}
if (is_remount) {
+ if (!sbi->s_journal &&
+ ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Remounting fs w/o journal so ignoring data_err option");
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT);
+ }
+
if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
ext4_msg(NULL, KERN_ERR, "can't mount with "
@@ -5318,6 +5325,11 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
"data=, fs mounted w/o journal");
goto failed_mount3a;
}
+ if (test_opt(sb, DATA_ERR_ABORT)) {
+ ext4_msg(sb, KERN_ERR,
+ "can't mount with data_err=abort, fs mounted w/o journal");
+ goto failed_mount3a;
+ }
sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
clear_opt(sb, JOURNAL_CHECKSUM);
clear_opt(sb, DATA_FLAGS);
@@ -6685,22 +6697,29 @@ static int ext4_statfs_project(struct super_block *sb,
dquot->dq_dqb.dqb_bhardlimit);
limit >>= sb->s_blocksize_bits;
- if (limit && buf->f_blocks > limit) {
+ if (limit) {
+ uint64_t remaining = 0;
+
curblock = (dquot->dq_dqb.dqb_curspace +
dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
- buf->f_blocks = limit;
- buf->f_bfree = buf->f_bavail =
- (buf->f_blocks > curblock) ?
- (buf->f_blocks - curblock) : 0;
+ if (limit > curblock)
+ remaining = limit - curblock;
+
+ buf->f_blocks = min(buf->f_blocks, limit);
+ buf->f_bfree = min(buf->f_bfree, remaining);
+ buf->f_bavail = min(buf->f_bavail, remaining);
}
limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
dquot->dq_dqb.dqb_ihardlimit);
- if (limit && buf->f_files > limit) {
- buf->f_files = limit;
- buf->f_ffree =
- (buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
- (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
+ if (limit) {
+ uint64_t remaining = 0;
+
+ if (limit > dquot->dq_dqb.dqb_curinodes)
+ remaining = limit - dquot->dq_dqb.dqb_curinodes;
+
+ buf->f_files = min(buf->f_files, limit);
+ buf->f_ffree = min(buf->f_ffree, remaining);
}
spin_unlock(&dquot->dq_dqb_lock);
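
The statfs clamping above, restated in miniature: the quota limit now caps f_blocks unconditionally, and the free counts are capped by what the quota leaves rather than being overwritten. A userspace sketch with illustrative numbers:

#include <stdint.h>

static inline uint64_t min_u64(uint64_t a, uint64_t b)
{
	return a < b ? a : b;
}

/*
 * E.g. limit=100 blocks, cur=30 charged, fs reports f_blocks=1000 and
 * f_bfree=500: the project sees f_blocks=100, f_bfree=f_bavail=70,
 * never more than the quota leaves, never more than the fs has.
 */
static void clamp_to_quota(uint64_t limit, uint64_t cur, uint64_t *blocks,
			   uint64_t *bfree, uint64_t *bavail)
{
	uint64_t remaining = limit > cur ? limit - cur : 0;

	*blocks = min_u64(*blocks, limit);
	*bfree = min_u64(*bfree, remaining);
	*bavail = min_u64(*bavail, remaining);
}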
@@ -6803,12 +6822,25 @@ static int ext4_release_dquot(struct dquot *dquot)
{
int ret, err;
handle_t *handle;
+ bool freeze_protected = false;
+
+ /*
+ * Trying to sb_start_intwrite() in a running transaction
+ * can result in a deadlock. Further, running transactions
+ * are already protected from freezing.
+ */
+ if (!ext4_journal_current_handle()) {
+ sb_start_intwrite(dquot->dq_sb);
+ freeze_protected = true;
+ }
handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle)) {
/* Release dquot anyway to avoid endless cycle in dqput() */
dquot_release(dquot);
+ if (freeze_protected)
+ sb_end_intwrite(dquot->dq_sb);
return PTR_ERR(handle);
}
ret = dquot_release(dquot);
@@ -6819,6 +6851,10 @@ static int ext4_release_dquot(struct dquot *dquot)
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
+
+ if (freeze_protected)
+ sb_end_intwrite(dquot->dq_sb);
+
return ret;
}
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 5598aec75775..95dbc7c9843b 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1127,15 +1127,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
{
struct inode *ea_inode;
struct ext4_xattr_entry *entry;
+ struct ext4_iloc iloc;
bool dirty = false;
unsigned int ea_ino;
int err;
int credits;
+ void *end;
+
+ if (block_csum)
+ end = (void *)bh->b_data + bh->b_size;
+ else {
+ ext4_get_inode_loc(parent, &iloc);
+ end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size;
+ }
/* One credit for dec ref on ea_inode, one for orphan list addition, */
credits = 2 + extra_credits;
- for (entry = first; !IS_LAST_ENTRY(entry);
+ for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry);
entry = EXT4_XATTR_NEXT(entry)) {
if (!entry->e_value_inum)
continue;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 0b0e3d44e158..dc8f283f210c 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -56,8 +56,8 @@ bool f2fs_is_cp_guaranteed(struct page *page)
struct inode *inode;
struct f2fs_sb_info *sbi;
- if (!mapping)
- return false;
+ if (fscrypt_is_bounce_page(page))
+ return page_private_gcing(fscrypt_pagecache_page(page));
inode = mapping->host;
sbi = F2FS_I_SB(inode);
@@ -273,7 +273,7 @@ static void f2fs_read_end_io(struct bio *bio)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio));
struct bio_post_read_ctx *ctx;
- bool intask = in_task();
+ bool intask = in_task() && !irqs_disabled();
iostat_update_and_unbind_ctx(bio, 0);
ctx = bio->bi_private;
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index f13143efc4b1..a5c63c7da299 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -440,7 +440,7 @@ void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage)
struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext;
struct extent_tree *et;
struct extent_node *en;
- struct extent_info ei;
+ struct extent_info ei = {0};
if (!__may_extent_tree(inode, EX_READ)) {
/* drop largest read extent */
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 840a45855451..1ad9669666e8 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1226,7 +1226,7 @@ struct f2fs_bio_info {
#define RDEV(i) (raw_super->devs[i])
struct f2fs_dev_info {
struct block_device *bdev;
- char path[MAX_PATH_LEN];
+ char path[MAX_PATH_LEN + 1];
unsigned int total_segments;
block_t start_blk;
block_t end_blk;
@@ -2387,8 +2387,14 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK;
spin_lock(&sbi->stat_lock);
- f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count);
- sbi->total_valid_block_count -= (block_t)count;
+ if (unlikely(sbi->total_valid_block_count < count)) {
+ f2fs_warn(sbi, "Inconsistent total_valid_block_count:%u, ino:%lu, count:%u",
+ sbi->total_valid_block_count, inode->i_ino, count);
+ sbi->total_valid_block_count = 0;
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ } else {
+ sbi->total_valid_block_count -= count;
+ }
if (sbi->reserved_blocks &&
sbi->current_reserved_blocks < sbi->reserved_blocks)
sbi->current_reserved_blocks = min(sbi->reserved_blocks,
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 0f350368dea7..c02b5ea43f07 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -695,8 +695,12 @@ retry:
if (err == -ENOENT)
return;
+ if (err == -EFSCORRUPTED)
+ goto stop_checkpoint;
+
if (err == -ENOMEM || ++count <= DEFAULT_RETRY_IO_COUNT)
goto retry;
+stop_checkpoint:
f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_UPDATE_INODE);
return;
}
@@ -827,6 +831,19 @@ retry:
f2fs_update_inode_page(inode);
if (dquot_initialize_needed(inode))
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+
+ /*
+ * If both f2fs_truncate() and f2fs_update_inode_page() failed
+ * due to a fuzzed, corrupted inode, call f2fs_inode_synced() to
+ * avoid triggering later f2fs_bug_on().
+ */
+ if (is_inode_flag_set(inode, FI_DIRTY_INODE)) {
+ f2fs_warn(sbi,
+ "f2fs_evict_inode: inode is dirty, ino:%lu",
+ inode->i_ino);
+ f2fs_inode_synced(inode);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ }
}
if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
sb_end_intwrite(inode->i_sb);
@@ -843,8 +860,12 @@ no_delete:
if (likely(!f2fs_cp_error(sbi) &&
!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE));
- else
- f2fs_inode_synced(inode);
+
+ /*
+ * In any case, the inode must be removed from the
+ * sbi->inode_list[DIRTY_META] list to avoid a UAF in
+ * f2fs_sync_inode_meta() during checkpoint.
+ */
+ f2fs_inode_synced(inode);
/* for the case f2fs_new_inode() was failed, .i_ino is zero, skip it */
if (inode->i_ino)
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 9da104c0743c..f61290a52213 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -401,7 +401,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
if (is_inode_flag_set(dir, FI_PROJ_INHERIT) &&
(!projid_eq(F2FS_I(dir)->i_projid,
- F2FS_I(old_dentry->d_inode)->i_projid)))
+ F2FS_I(inode)->i_projid)))
return -EXDEV;
err = f2fs_dquot_initialize(dir);
@@ -550,6 +550,15 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
goto fail;
}
+ if (unlikely(inode->i_nlink == 0)) {
+ f2fs_warn(F2FS_I_SB(inode), "%s: inode (ino=%lx) has zero i_nlink",
+ __func__, inode->i_ino);
+ err = -EFSCORRUPTED;
+ set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
+ f2fs_put_page(page, 0);
+ goto fail;
+ }
+
f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
@@ -896,7 +905,7 @@ static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) &&
(!projid_eq(F2FS_I(new_dir)->i_projid,
- F2FS_I(old_dentry->d_inode)->i_projid)))
+ F2FS_I(old_inode)->i_projid)))
return -EXDEV;
/*
@@ -1085,10 +1094,10 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) &&
!projid_eq(F2FS_I(new_dir)->i_projid,
- F2FS_I(old_dentry->d_inode)->i_projid)) ||
- (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) &&
+ F2FS_I(old_inode)->i_projid)) ||
+ (is_inode_flag_set(old_dir, FI_PROJ_INHERIT) &&
!projid_eq(F2FS_I(old_dir)->i_projid,
- F2FS_I(new_dentry->d_inode)->i_projid)))
+ F2FS_I(new_inode)->i_projid)))
return -EXDEV;
err = f2fs_dquot_initialize(old_dir);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 745ecf5523c9..ccc72781e0c6 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1112,7 +1112,14 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from)
trace_f2fs_truncate_inode_blocks_enter(inode, from);
level = get_node_path(inode, from, offset, noffset);
- if (level < 0) {
+ if (level <= 0) {
+ if (!level) {
+ level = -EFSCORRUPTED;
+ f2fs_err(sbi, "%s: inode ino=%lx has corrupted node block, from:%lu addrs:%u",
+ __func__, inode->i_ino,
+ from, ADDRS_PER_INODE(inode));
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ }
trace_f2fs_truncate_inode_blocks_exit(inode, level);
return level;
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 5ef5a88f47a0..20d4387c661d 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -625,8 +625,7 @@ static inline void __get_secs_required(struct f2fs_sb_info *sbi,
unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi);
unsigned int data_blocks = 0;
- if (f2fs_lfs_mode(sbi) &&
- unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+ if (f2fs_lfs_mode(sbi)) {
total_data_blocks = get_pages(sbi, F2FS_DIRTY_DATA);
data_secs = total_data_blocks / CAP_BLKS_PER_SEC(sbi);
data_blocks = total_data_blocks % CAP_BLKS_PER_SEC(sbi);
@@ -635,7 +634,7 @@ static inline void __get_secs_required(struct f2fs_sb_info *sbi,
if (lower_p)
*lower_p = node_secs + dent_secs + data_secs;
if (upper_p)
- *upper_p = node_secs + dent_secs +
+ *upper_p = node_secs + dent_secs + data_secs +
(node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0) +
(data_blocks ? 1 : 0);
if (curseg_p)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 72160b906f4b..84fc6591e3f9 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1753,26 +1753,32 @@ static int f2fs_statfs_project(struct super_block *sb,
limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
dquot->dq_dqb.dqb_bhardlimit);
- if (limit)
- limit >>= sb->s_blocksize_bits;
+ limit >>= sb->s_blocksize_bits;
+
+ if (limit) {
+ uint64_t remaining = 0;
- if (limit && buf->f_blocks > limit) {
curblock = (dquot->dq_dqb.dqb_curspace +
dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
- buf->f_blocks = limit;
- buf->f_bfree = buf->f_bavail =
- (buf->f_blocks > curblock) ?
- (buf->f_blocks - curblock) : 0;
+ if (limit > curblock)
+ remaining = limit - curblock;
+
+ buf->f_blocks = min(buf->f_blocks, limit);
+ buf->f_bfree = min(buf->f_bfree, remaining);
+ buf->f_bavail = min(buf->f_bavail, remaining);
}
limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
dquot->dq_dqb.dqb_ihardlimit);
- if (limit && buf->f_files > limit) {
- buf->f_files = limit;
- buf->f_ffree =
- (buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
- (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
+ if (limit) {
+ uint64_t remaining = 0;
+
+ if (limit > dquot->dq_dqb.dqb_curinodes)
+ remaining = limit - dquot->dq_dqb.dqb_curinodes;
+
+ buf->f_files = min(buf->f_files, limit);
+ buf->f_ffree = min(buf->f_ffree, remaining);
}
spin_unlock(&dquot->dq_dqb_lock);
@@ -1830,9 +1836,9 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_fsid = u64_to_fsid(id);
#ifdef CONFIG_QUOTA
- if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) &&
+ if (is_inode_flag_set(d_inode(dentry), FI_PROJ_INHERIT) &&
sb_has_quota_limits_enabled(sb, PRJQUOTA)) {
- f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf);
+ f2fs_statfs_project(sb, F2FS_I(d_inode(dentry))->i_projid, buf);
}
#endif
return 0;
@@ -3499,6 +3505,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi)
block_t user_block_count, valid_user_blocks;
block_t avail_node_count, valid_node_count;
unsigned int nat_blocks, nat_bits_bytes, nat_bits_blocks;
+ unsigned int sit_blk_cnt;
int i, j;
total = le32_to_cpu(raw_super->segment_count);
@@ -3610,6 +3617,13 @@ skip_cross:
return 1;
}
+ sit_blk_cnt = DIV_ROUND_UP(main_segs, SIT_ENTRY_PER_BLOCK);
+ if (sit_bitmap_size * 8 < sit_blk_cnt) {
+ f2fs_err(sbi, "Wrong bitmap size: sit: %u, sit_blk_cnt:%u",
+ sit_bitmap_size, sit_blk_cnt);
+ return 1;
+ }
+
cp_pack_start_sum = __start_sum_addr(sbi);
cp_payload = __cp_payload(sbi);
if (cp_pack_start_sum < cp_payload + 1 ||
diff --git a/fs/file.c b/fs/file.c
index bc0c087b31bb..2eccbb5dcd86 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -362,17 +362,25 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
old_fds = old_fdt->fd;
new_fds = new_fdt->fd;
+ /*
+ * We may be racing against fd allocation from other threads using this
+ * files_struct, despite holding ->file_lock.
+ *
+ * alloc_fd() might have already claimed a slot, while fd_install()
+ * did not populate it yet. Note the latter operates locklessly, so
+ * the file can show up as we are walking the array below.
+ *
+ * At the same time we know no files will disappear as all other
+ * operations take the lock.
+ *
+ * Instead of trying to placate userspace racing with itself, we
+ * ref the file if we see it and mark the fd slot as unused otherwise.
+ */
for (i = open_files; i != 0; i--) {
- struct file *f = *old_fds++;
+ struct file *f = rcu_dereference_raw(*old_fds++);
if (f) {
get_file(f);
} else {
- /*
- * The fd may be claimed in the fd bitmap but not yet
- * instantiated in the files array if a sibling thread
- * is partway through open(). So make sure that this
- * fd is available to the new process.
- */
__clear_open_fd(open_files - i, new_fdt);
}
rcu_assign_pointer(*new_fds++, f);
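
The invariant the comment above describes, modeled in miniature with C11 atomics (illustrative only; RCU and the real locking are elided):

#include <stdatomic.h>
#include <stddef.h>

struct file;	/* opaque here */

struct fdtable_model {
	_Atomic(struct file *) fd[64];
	unsigned long open_bitmap;	/* protected by file_lock */
};

/* fd_install() analogue: publishes the pointer without the lock. */
static void install(struct fdtable_model *t, int fd, struct file *f)
{
	atomic_store_explicit(&t->fd[fd], f, memory_order_release);
}

/* dup_fd() analogue: runs with file_lock held, but may still observe
 * a slot that is claimed in the bitmap and not yet populated. */
static struct file *dup_slot(struct fdtable_model *t, int fd,
			     unsigned long *new_bitmap)
{
	struct file *f = atomic_load_explicit(&t->fd[fd],
					      memory_order_relaxed);

	if (!f)
		*new_bitmap &= ~(1UL << fd);	/* mark slot unused in the copy */
	return f;	/* caller takes a reference when non-NULL */
}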
@@ -625,7 +633,7 @@ static struct file *pick_file(struct files_struct *files, unsigned fd)
return NULL;
fd = array_index_nospec(fd, fdt->max_fds);
- file = fdt->fd[fd];
+ file = rcu_dereference_raw(fdt->fd[fd]);
if (file) {
rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd);
@@ -1093,7 +1101,7 @@ __releases(&files->file_lock)
*/
fdt = files_fdtable(files);
fd = array_index_nospec(fd, fdt->max_fds);
- tofree = fdt->fd[fd];
+ tofree = rcu_dereference_raw(fdt->fd[fd]);
if (!tofree && fd_is_open(fd, fdt))
goto Ebusy;
get_file(file);
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 58b9067b2391..95e5256821a5 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -156,15 +156,19 @@ static int fs_index(const char __user * __name)
static int fs_name(unsigned int index, char __user * buf)
{
struct file_system_type * tmp;
- int len, res;
+ int len, res = -EINVAL;
read_lock(&file_systems_lock);
- for (tmp = file_systems; tmp; tmp = tmp->next, index--)
- if (index <= 0 && try_module_get(tmp->owner))
+ for (tmp = file_systems; tmp; tmp = tmp->next, index--) {
+ if (index == 0) {
+ if (try_module_get(tmp->owner))
+ res = 0;
break;
+ }
+ }
read_unlock(&file_systems_lock);
- if (!tmp)
- return -EINVAL;
+ if (res)
+ return res;
/* OK, we got the reference, so we can safely block */
len = strlen(tmp->name) + 1;
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 6e71904c396f..dc28c28654d9 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -681,7 +681,6 @@ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
0, 0, fuse_wait_dax_page(inode));
}
-/* dmap_end == 0 leads to unmapping of whole file */
int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
u64 dmap_end)
{
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index aa2be4c1ea8f..0dbacdd7bb0d 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1068,6 +1068,8 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
else if (err == -EINTR)
fuse_invalidate_attr(inode);
+ if (err == -ENOSYS)
+ err = -EPERM;
return err;
}
@@ -1445,7 +1447,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode,
goto out_err;
if (fc->cache_symlinks)
- return page_get_link(dentry, inode, callback);
+ return page_get_link_raw(dentry, inode, callback);
err = -ECHILD;
if (!dentry)
@@ -1712,7 +1714,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
if (FUSE_IS_DAX(inode) && is_truncate) {
filemap_invalidate_lock(mapping);
fault_blocked = true;
- err = fuse_dax_break_layouts(inode, 0, 0);
+ err = fuse_dax_break_layouts(inode, 0, -1);
if (err) {
filemap_invalidate_unlock(mapping);
return err;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0df1311afb87..723dd9b94e56 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -240,7 +240,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
if (dax_truncate) {
filemap_invalidate_lock(inode->i_mapping);
- err = fuse_dax_break_layouts(inode, 0, 0);
+ err = fuse_dax_break_layouts(inode, 0, -1);
if (err)
goto out_inode_unlock;
}
@@ -3020,7 +3020,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
inode_lock(inode);
if (block_faults) {
filemap_invalidate_lock(inode->i_mapping);
- err = fuse_dax_break_layouts(inode, 0, 0);
+ err = fuse_dax_break_layouts(inode, 0, -1);
if (err)
goto out;
}
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 92d41269f1d3..9e4f2ba0ef9d 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1429,6 +1429,9 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
unsigned int virtqueue_size;
int err = -EIO;
+ if (!fsc->source)
+ return invalf(fsc, "No source specified");
+
/* This gets a reference on virtio_fs object. This ptr gets installed
* in fc->iq->priv. Once fuse_conn is going away, it calls ->put()
* to drop the reference to this object.
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6ba8460f5331..428c1db295fa 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -885,11 +885,12 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock)
__releases(&gl->gl_lockref.lock)
__acquires(&gl->gl_lockref.lock)
{
- struct gfs2_holder *gh = NULL;
+ struct gfs2_holder *gh;
if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
return;
+ /* While a demote is in progress, the GLF_LOCK flag must be set. */
GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
@@ -901,18 +902,22 @@ __acquires(&gl->gl_lockref.lock)
set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE);
gl->gl_target = gl->gl_demote_state;
+ do_xmote(gl, NULL, gl->gl_target);
+ return;
} else {
if (test_bit(GLF_DEMOTE, &gl->gl_flags))
gfs2_demote_wake(gl);
if (do_promote(gl) == 0)
goto out_unlock;
gh = find_first_waiter(gl);
+ if (!gh)
+ goto out_unlock;
gl->gl_target = gh->gh_state;
if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
do_error(gl, 0); /* Fail queued try locks */
+ do_xmote(gl, gh, gl->gl_target);
+ return;
}
- do_xmote(gl, gh, gl->gl_target);
- return;
out_sched:
clear_bit(GLF_LOCK, &gl->gl_flags);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 04fc3e72a96e..06629aeefbe6 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -631,7 +631,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (!IS_ERR(inode)) {
if (S_ISDIR(inode->i_mode)) {
iput(inode);
- inode = ERR_PTR(-EISDIR);
+ inode = NULL;
+ error = -EISDIR;
goto fail_gunlock;
}
d_instantiate(dentry, inode);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 884081730f9f..588760c1a5da 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -950,14 +950,15 @@ locks_done:
if (sdp->sd_args.ar_spectator) {
fs_info(sdp, "Recovery is required. Waiting for a "
"non-spectator to mount.\n");
+ spin_unlock(&ls->ls_recover_spin);
msleep_interruptible(1000);
} else {
fs_info(sdp, "control_mount wait1 block %u start %u "
"mount %u lvb %u flags %lx\n", block_gen,
start_gen, mount_gen, lvb_gen,
ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
}
- spin_unlock(&ls->ls_recover_spin);
goto restart;
}
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 6add6ebfef89..cb823a8a6ba9 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -67,6 +67,12 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off)
else
key_len = tree->max_key_len + 1;
+ if (key_len > sizeof(hfs_btree_key) || key_len < 1) {
+ memset(key, 0, sizeof(hfs_btree_key));
+ pr_err("hfs: Invalid key length: %d\n", key_len);
+ return;
+ }
+
hfs_bnode_read(node, key, off, key_len);
}
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 87974d5e6791..079ea80534f7 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -67,6 +67,12 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off)
else
key_len = tree->max_key_len + 2;
+ if (key_len > sizeof(hfsplus_btree_key) || key_len < 1) {
+ memset(key, 0, sizeof(hfsplus_btree_key));
+ pr_err("hfsplus: Invalid key length: %d\n", key_len);
+ return;
+ }
+
hfs_bnode_read(node, key, off, key_len);
}
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 91354e769642..839bf83448c3 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -342,9 +342,6 @@ static int hfsplus_free_extents(struct super_block *sb,
int i;
int err = 0;
- /* Mapping the allocation file may lock the extent tree */
- WARN_ON(mutex_is_locked(&HFSPLUS_SB(sb)->ext_tree->tree_lock));
-
hfsplus_dump_extent(extent);
for (i = 0; i < 8; extent++, i++) {
count = be32_to_cpu(extent->block_count);
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index eb2f8273e6f1..09df40b612fb 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -147,7 +147,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *file,
de = tmpde;
}
/* Basic sanity check, whether name doesn't exceed dir entry */
- if (de_len < de->name_len[0] +
+ if (de_len < sizeof(struct iso_directory_record) ||
+ de_len < de->name_len[0] +
sizeof(struct iso_directory_record)) {
printk(KERN_NOTICE "iso9660: Corrupted directory entry"
" in block %lu of inode %lu\n", block,
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index 35768a63fb1d..421d247fae52 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -180,7 +180,7 @@ static struct dentry *isofs_fh_to_parent(struct super_block *sb,
return NULL;
return isofs_export_iget(sb,
- fh_len > 2 ? ifid->parent_block : 0,
+ fh_len > 3 ? ifid->parent_block : 0,
ifid->parent_offset,
fh_len > 4 ? ifid->parent_generation : 0);
}
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 8a6c7fdc1d5f..e3052d3fe5dc 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1491,9 +1491,16 @@ static int isofs_read_inode(struct inode *inode, int relocated)
inode->i_op = &page_symlink_inode_operations;
inode_nohighmem(inode);
inode->i_data.a_ops = &isofs_symlink_aops;
- } else
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
/* XXX - parse_rock_ridge_inode() had already set i_rdev. */
init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ } else {
+ printk(KERN_DEBUG "ISOFS: Invalid file type 0%04o for inode %lu.\n",
+ inode->i_mode, inode->i_ino);
+ ret = -EIO;
+ goto fail;
+ }
ret = 0;
out:
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 55ada2b88146..41ab2dfd1ac2 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1711,7 +1711,6 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
/* Log is no longer empty */
write_lock(&journal->j_state_lock);
- WARN_ON(!sb->s_sequence);
journal->j_flags &= ~JBD2_FLUSHED;
write_unlock(&journal->j_state_lock);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6ef5022949c4..ca6f3f8729f3 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1518,7 +1518,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
jh->b_next_transaction == transaction);
spin_unlock(&jh->b_state_lock);
}
- if (jh->b_modified == 1) {
+ if (data_race(jh->b_modified == 1)) {
/* If it's in our transaction it must be in BJ_Metadata list. */
if (data_race(jh->b_transaction == transaction &&
jh->b_jlist != BJ_Metadata)) {
@@ -1537,7 +1537,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
goto out;
}
- journal = transaction->t_journal;
spin_lock(&jh->b_state_lock);
if (is_handle_aborted(handle)) {
@@ -1552,6 +1551,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
goto out_unlock_bh;
}
+ journal = transaction->t_journal;
+
if (jh->b_modified == 0) {
/*
* This buffer's got modified and becoming part
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index ef3a1e1b6cb0..fda9f4d6093f 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -425,7 +425,9 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
.totlen = cpu_to_je32(c->cleanmarker_size)
};
- jffs2_prealloc_raw_node_refs(c, jeb, 1);
+ ret = jffs2_prealloc_raw_node_refs(c, jeb, 1);
+ if (ret)
+ goto filebad;
marker.hdr_crc = cpu_to_je32(crc32(0, &marker, sizeof(struct jffs2_unknown_node)-4));
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 29671e33a171..62879c218d4b 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -256,7 +256,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
jffs2_dbg(1, "%s(): Skipping %d bytes in nextblock to ensure page alignment\n",
__func__, skip);
- jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
+ ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
+ if (ret)
+ goto out;
jffs2_scan_dirty_space(c, c->nextblock, skip);
}
#endif
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 4fe64519870f..d83372d3e1a0 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -858,7 +858,10 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
spin_unlock(&c->erase_completion_lock);
jeb = c->nextblock;
- jffs2_prealloc_raw_node_refs(c, jeb, 1);
+ ret = jffs2_prealloc_raw_node_refs(c, jeb, 1);
+
+ if (ret)
+ goto out;
if (!c->summary->sum_num || !c->summary->sum_list_head) {
JFFS2_WARNING("Empty summary info!!!\n");
@@ -872,6 +875,8 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
datasize += padsize;
ret = jffs2_sum_write_data(c, jeb, infosize, datasize, padsize);
+
+out:
spin_lock(&c->erase_completion_lock);
return ret;
}
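
All three jffs2 hunks apply the same pattern: a helper that can fail (jffs2_prealloc_raw_node_refs() returns -ENOMEM on allocation failure) previously had its return value dropped on the floor. A minimal sketch of the fix, with hypothetical helper names standing in for the real ones:

#include <errno.h>

static int prealloc(int n) { return n ? 0 : -ENOMEM; }	/* hypothetical */
static int write_summary(void) { return 0; }		/* hypothetical */

int sum_write(void)
{
	int ret;

	ret = prealloc(1);
	if (ret)
		goto out;	/* previously: error silently ignored */

	ret = write_summary();
out:
	return ret;
}
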
diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h
index 6b231d0d0071..603aae17a693 100644
--- a/fs/jfs/jfs_dinode.h
+++ b/fs/jfs/jfs_dinode.h
@@ -96,7 +96,7 @@ struct dinode {
#define di_gengen u._file._u1._imap._gengen
union {
- xtpage_t _xtroot;
+ xtroot_t _xtroot;
struct {
u8 unused[16]; /* 16: */
dxd_t _dxd; /* 16: */
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 6509102e581a..c761291f59ac 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -178,41 +178,30 @@ int dbMount(struct inode *ipbmap)
dbmp_le = (struct dbmap_disk *) mp->data;
bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize);
bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
-
bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
- if (bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE ||
- bmp->db_l2nbperpage < 0) {
- err = -EINVAL;
- goto err_release_metapage;
- }
-
bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
- if (!bmp->db_numag || bmp->db_numag > MAXAG) {
- err = -EINVAL;
- goto err_release_metapage;
- }
-
bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
- if (bmp->db_maxag >= MAXAG || bmp->db_maxag < 0 ||
- bmp->db_agpref >= MAXAG || bmp->db_agpref < 0) {
- err = -EINVAL;
- goto err_release_metapage;
- }
-
bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight);
bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
- if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG ||
- bmp->db_agl2size < 0) {
- err = -EINVAL;
- goto err_release_metapage;
- }
- if (((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) {
+ if ((bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE) ||
+ (bmp->db_l2nbperpage < 0) ||
+ !bmp->db_numag || (bmp->db_numag > MAXAG) ||
+ (bmp->db_maxag >= MAXAG) || (bmp->db_maxag < 0) ||
+ (bmp->db_agpref >= MAXAG) || (bmp->db_agpref < 0) ||
+ (bmp->db_agheight < 0) || (bmp->db_agheight > (L2LPERCTL >> 1)) ||
+ (bmp->db_agwidth < 1) || (bmp->db_agwidth > (LPERCTL / MAXAG)) ||
+ (bmp->db_agwidth > (1 << (L2LPERCTL - (bmp->db_agheight << 1)))) ||
+ (bmp->db_agstart < 0) ||
+ (bmp->db_agstart > (CTLTREESIZE - 1 - bmp->db_agwidth * (MAXAG - 1))) ||
+ (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) ||
+ (bmp->db_agl2size < 0) ||
+ ((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) {
err = -EINVAL;
goto err_release_metapage;
}
@@ -1820,8 +1809,10 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
return -EIO;
dp = (struct dmap *) mp->data;
- if (dp->tree.budmin < 0)
+ if (dp->tree.budmin < 0) {
+ release_metapage(mp);
return -EIO;
+ }
/* try to allocate the blocks.
*/
@@ -3403,7 +3394,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
oldl2agsize = bmp->db_agl2size;
bmp->db_agl2size = l2agsize;
- bmp->db_agsize = 1 << l2agsize;
+ bmp->db_agsize = (s64)1 << l2agsize;
/* compute new number of AG */
agno = bmp->db_numag;
@@ -3666,8 +3657,8 @@ void dbFinalizeBmap(struct inode *ipbmap)
* system size is not a multiple of the group size).
*/
inactfree = (inactags && ag_rem) ?
- ((inactags - 1) << bmp->db_agl2size) + ag_rem
- : inactags << bmp->db_agl2size;
+ (((s64)inactags - 1) << bmp->db_agl2size) + ag_rem
+ : ((s64)inactags << bmp->db_agl2size);
/* determine how many free blocks are in the active
* allocation groups plus the average number of free blocks
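
The (s64) casts in this diff all guard against the same pitfall: in C, 1 << n is computed in int width, so once n reaches 31 the shift overflows (and is undefined for signed int) before the result is ever widened for assignment. A minimal demonstration:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	int l2agsize = 40;			/* > 31, too big for an int shift */

	/* int64_t wrong = 1 << l2agsize;	   undefined behaviour */
	int64_t agsize = (int64_t)1 << l2agsize; /* widen first: well-defined */

	assert(agsize == INT64_C(1) << 40);
	return 0;
}
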
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 8f85177f284b..93db6eec4465 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -117,7 +117,8 @@ do { \
if (!(RC)) { \
if (((P)->header.nextindex > \
(((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \
- ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \
+ ((BN) && (((P)->header.maxslot > DTPAGEMAXSLOT) || \
+ ((P)->header.stblindex >= DTPAGEMAXSLOT)))) { \
BT_PUTPAGE(MP); \
jfs_error((IP)->i_sb, \
"DT_GETPAGE: dtree page corrupt\n"); \
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index c72e97f06579..1f2e452a7676 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -102,7 +102,7 @@ int diMount(struct inode *ipimap)
* allocate/initialize the in-memory inode map control structure
*/
/* allocate the in-memory inode map control structure. */
- imap = kmalloc(sizeof(struct inomap), GFP_KERNEL);
+ imap = kzalloc(sizeof(struct inomap), GFP_KERNEL);
if (imap == NULL)
return -ENOMEM;
@@ -456,7 +456,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
dp += inum % 8; /* 8 inodes per 4K page */
/* copy on-disk inode to in-memory inode */
- if ((copy_from_dinode(dp, ip)) != 0) {
+ if ((copy_from_dinode(dp, ip) != 0) || (ip->i_nlink == 0)) {
/* handle bad return by returning NULL for ip */
set_nlink(ip, 1); /* Don't want iput() deleting it */
iput(ip);
@@ -673,7 +673,7 @@ int diWrite(tid_t tid, struct inode *ip)
* This is the special xtree inside the directory for storing
* the directory table
*/
- xtpage_t *p, *xp;
+ xtroot_t *p, *xp;
xad_t *xad;
jfs_ip->xtlid = 0;
@@ -687,7 +687,7 @@ int diWrite(tid_t tid, struct inode *ip)
* copy xtree root from inode to dinode:
*/
p = &jfs_ip->i_xtroot;
- xp = (xtpage_t *) &dp->di_dirtable;
+ xp = (xtroot_t *) &dp->di_dirtable;
lv = ilinelock->lv;
for (n = 0; n < ilinelock->index; n++, lv++) {
memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
@@ -716,7 +716,7 @@ int diWrite(tid_t tid, struct inode *ip)
* regular file: 16 byte (XAD slot) granularity
*/
if (type & tlckXTREE) {
- xtpage_t *p, *xp;
+ xtroot_t *p, *xp;
xad_t *xad;
/*
@@ -3029,14 +3029,23 @@ static void duplicateIXtree(struct super_block *sb, s64 blkno,
*
* RETURN VALUES:
* 0 - success
- * -ENOMEM - insufficient memory
+ * -EINVAL - unexpected inode type
*/
static int copy_from_dinode(struct dinode * dip, struct inode *ip)
{
struct jfs_inode_info *jfs_ip = JFS_IP(ip);
struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+ int fileset = le32_to_cpu(dip->di_fileset);
+
+ switch (fileset) {
+ case AGGR_RESERVED_I: case AGGREGATE_I: case BMAP_I:
+ case LOG_I: case BADBLOCK_I: case FILESYSTEM_I:
+ break;
+ default:
+ return -EINVAL;
+ }
- jfs_ip->fileset = le32_to_cpu(dip->di_fileset);
+ jfs_ip->fileset = fileset;
jfs_ip->mode2 = le32_to_cpu(dip->di_mode);
jfs_set_inode_flags(ip);
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 721def69e732..dd4264aa9bed 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -66,7 +66,7 @@ struct jfs_inode_info {
lid_t xtlid; /* lid of xtree lock on directory */
union {
struct {
- xtpage_t _xtroot; /* 288: xtree root */
+ xtroot_t _xtroot; /* 288: xtree root */
struct inomap *_imap; /* 4: inode map header */
} file;
struct {
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index ce4b4760fcb1..dccc8b3f1045 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -783,7 +783,7 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
if (mp->xflag & COMMIT_PAGE)
p = (xtpage_t *) mp->data;
else
- p = &jfs_ip->i_xtroot;
+ p = (xtpage_t *) &jfs_ip->i_xtroot;
xtlck->lwm.offset =
le16_to_cpu(p->header.nextindex);
}
@@ -1676,7 +1676,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
if (tlck->type & tlckBTROOT) {
lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
- p = &JFS_IP(ip)->i_xtroot;
+ p = (xtpage_t *) &JFS_IP(ip)->i_xtroot;
if (S_ISDIR(ip->i_mode))
lrd->log.redopage.type |=
cpu_to_le16(LOG_DIR_XTREE);
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 2d304cee884c..5ee618d17e77 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -1213,7 +1213,7 @@ xtSplitRoot(tid_t tid,
struct xtlock *xtlck;
int rc;
- sp = &JFS_IP(ip)->i_xtroot;
+ sp = (xtpage_t *) &JFS_IP(ip)->i_xtroot;
INCREMENT(xtStat.split);
@@ -2098,7 +2098,7 @@ int xtAppend(tid_t tid, /* transaction id */
*/
void xtInitRoot(tid_t tid, struct inode *ip)
{
- xtpage_t *p;
+ xtroot_t *p;
/*
* acquire a transaction lock on the root
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index 142caafc73b1..15da4e16d8b2 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -65,24 +65,33 @@ struct xadlist {
#define XTPAGEMAXSLOT 256
#define XTENTRYSTART 2
-/*
- * xtree page:
- */
-typedef union {
- struct xtheader {
- __le64 next; /* 8: */
- __le64 prev; /* 8: */
+struct xtheader {
+ __le64 next; /* 8: */
+ __le64 prev; /* 8: */
- u8 flag; /* 1: */
- u8 rsrvd1; /* 1: */
- __le16 nextindex; /* 2: next index = number of entries */
- __le16 maxentry; /* 2: max number of entries */
- __le16 rsrvd2; /* 2: */
+ u8 flag; /* 1: */
+ u8 rsrvd1; /* 1: */
+ __le16 nextindex; /* 2: next index = number of entries */
+ __le16 maxentry; /* 2: max number of entries */
+ __le16 rsrvd2; /* 2: */
- pxd_t self; /* 8: self */
- } header; /* (32) */
+ pxd_t self; /* 8: self */
+};
+/*
+ * xtree root (in inode):
+ */
+typedef union {
+ struct xtheader header;
xad_t xad[XTROOTMAXSLOT]; /* 16 * maxentry: xad array */
+} xtroot_t;
+
+/*
+ * xtree page:
+ */
+typedef union {
+ struct xtheader header;
+ xad_t xad[XTPAGEMAXSLOT]; /* 16 * maxentry: xad array */
} xtpage_t;
/*
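
The point of splitting the old single union is that the root embedded in the inode only has room for XTROOTMAXSLOT xad entries, while an on-disk page holds XTPAGEMAXSLOT; describing both with one xtpage_t overstated the in-inode root's size. A compilable sketch with stand-in types (the slot constants here are illustrative, chosen to match the 288-byte root noted in jfs_incore.h and a 4 KiB page):

#include <stdint.h>
#include <stdio.h>

#define XTROOTMAXSLOT 18
#define XTPAGEMAXSLOT 256

struct xad { uint64_t w[2]; };				/* 16-byte stand-in */
struct xtheader { uint64_t next, prev, misc, self; };	/* 32-byte stand-in */

typedef union { struct xtheader header; struct xad xad[XTROOTMAXSLOT]; } xtroot_t;
typedef union { struct xtheader header; struct xad xad[XTPAGEMAXSLOT]; } xtpage_t;

int main(void)
{
	printf("root %zu bytes, page %zu bytes\n",
	       sizeof(xtroot_t), sizeof(xtpage_t));	/* 288 vs 4096 */
	return 0;
}
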
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 2b9b98ff2dd6..e6f2c619b30a 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -559,11 +559,16 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
size_check:
if (EALIST_SIZE(ea_buf->xattr) != ea_size) {
- int size = clamp_t(int, ea_size, 0, EALIST_SIZE(ea_buf->xattr));
-
- printk(KERN_ERR "ea_get: invalid extended attribute\n");
- print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1,
- ea_buf->xattr, size, 1);
+ if (unlikely(EALIST_SIZE(ea_buf->xattr) > INT_MAX)) {
+ printk(KERN_ERR "ea_get: extended attribute size too large: %u > INT_MAX\n",
+ EALIST_SIZE(ea_buf->xattr));
+ } else {
+ int size = clamp_t(int, ea_size, 0, EALIST_SIZE(ea_buf->xattr));
+
+ printk(KERN_ERR "ea_get: invalid extended attribute\n");
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1,
+ ea_buf->xattr, size, 1);
+ }
ea_release(inode, ea_buf);
rc = -EIO;
goto clean_up;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 2c74b24fc22a..a259fe3471a9 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1532,8 +1532,9 @@ void kernfs_break_active_protection(struct kernfs_node *kn)
* invoked before finishing the kernfs operation. Note that while this
* function restores the active reference, it doesn't and can't actually
* restore the active protection - @kn may already or be in the process of
- * being removed. Once kernfs_break_active_protection() is invoked, that
- * protection is irreversibly gone for the kernfs operation instance.
+ * being drained and removed. Once kernfs_break_active_protection() is
+ * invoked, that protection is irreversibly gone for the kernfs operation
+ * instance.
*
* While this function may be called at any point after
* kernfs_break_active_protection() is invoked, its most useful location
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index adf3536cfec8..cf57b7cc3a43 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -820,8 +820,9 @@ bool kernfs_should_drain_open_files(struct kernfs_node *kn)
/*
* @kn being deactivated guarantees that @kn->attr.open can't change
* beneath us making the lockless test below safe.
+	 * Callers that have gone through kernfs_unbreak_active_protection()
+	 * may be counted in kn->active by now; do not WARN_ON because of them.
*/
- WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
rcu_read_lock();
on = rcu_dereference(kn->attr.open);
diff --git a/fs/namei.c b/fs/namei.c
index 166d71c82d7a..6ce07cde1c27 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -5156,10 +5156,9 @@ const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
EXPORT_SYMBOL(vfs_get_link);
/* get the link contents into pagecache */
-const char *page_get_link(struct dentry *dentry, struct inode *inode,
- struct delayed_call *callback)
+static char *__page_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback)
{
- char *kaddr;
struct page *page;
struct address_space *mapping = inode->i_mapping;
@@ -5178,8 +5177,23 @@ const char *page_get_link(struct dentry *dentry, struct inode *inode,
}
set_delayed_call(callback, page_put_link, page);
BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
- kaddr = page_address(page);
- nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
+ return page_address(page);
+}
+
+const char *page_get_link_raw(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback)
+{
+ return __page_get_link(dentry, inode, callback);
+}
+EXPORT_SYMBOL_GPL(page_get_link_raw);
+
+const char *page_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback)
+{
+ char *kaddr = __page_get_link(dentry, inode, callback);
+
+ if (!IS_ERR(kaddr))
+ nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
return kaddr;
}
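
The refactor above follows a common factoring pattern: one internal helper does the shared work, and two thin exported wrappers differ only in the final post-processing step, here NUL-terminating the link body. A simplified sketch of the shape, with stand-in names and types rather than the kernel API:

#include <stddef.h>

static char *get_link_common(char *buf)
{
	/* shared work: look up and map the symlink page, validate it */
	return buf;
}

const char *get_link_raw(char *buf)
{
	return get_link_common(buf);	/* body returned untouched */
}

const char *get_link(char *buf, size_t size, size_t len)
{
	char *p = get_link_common(buf);

	if (p && len < size)
		p[len] = '\0';		/* terminate at the inode's size */
	return p;
}
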
diff --git a/fs/namespace.c b/fs/namespace.c
index 59a9f877738b..f0fa2a1a6b05 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -629,15 +629,11 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
return 0;
mnt = real_mount(bastard);
mnt_add_count(mnt, 1);
- smp_mb(); // see mntput_no_expire()
+ smp_mb(); // see mntput_no_expire() and do_umount()
if (likely(!read_seqretry(&mount_lock, seq)))
return 0;
- if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
- mnt_add_count(mnt, -1);
- return 1;
- }
lock_mount_hash();
- if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
+ if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) {
mnt_add_count(mnt, -1);
unlock_mount_hash();
return 1;
@@ -1707,6 +1703,7 @@ static int do_umount(struct mount *mnt, int flags)
umount_tree(mnt, UMOUNT_PROPAGATE);
retval = 0;
} else {
+ smp_mb(); // paired with __legitimize_mnt()
shrink_submounts(mnt);
retval = -EBUSY;
if (!propagate_mount_busy(mnt, 2)) {
@@ -1777,6 +1774,7 @@ static void warn_mandlock(void)
static int can_umount(const struct path *path, int flags)
{
struct mount *mnt = real_mount(path->mnt);
+ struct super_block *sb = path->dentry->d_sb;
if (!may_mount())
return -EPERM;
@@ -1786,7 +1784,7 @@ static int can_umount(const struct path *path, int flags)
return -EINVAL;
if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
return -EINVAL;
- if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
+ if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
return -EPERM;
return 0;
}
@@ -2020,6 +2018,11 @@ struct vfsmount *clone_private_mount(const struct path *path)
if (!check_mnt(old_mnt))
goto invalid;
+ if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) {
+ up_read(&namespace_sem);
+ return ERR_PTR(-EPERM);
+ }
+
if (has_locked_children(old_mnt, path->dentry))
goto invalid;
@@ -2251,14 +2254,14 @@ static int attach_recursive_mnt(struct mount *source_mnt,
hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
struct mount *q;
hlist_del_init(&child->mnt_hash);
- q = __lookup_mnt(&child->mnt_parent->mnt,
- child->mnt_mountpoint);
- if (q)
- mnt_change_mountpoint(child, smp, q);
/* Notice when we are propagating across user namespaces */
if (child->mnt_parent->mnt_ns->user_ns != user_ns)
lock_mnt_tree(child);
child->mnt.mnt_flags &= ~MNT_LOCKED;
+ q = __lookup_mnt(&child->mnt_parent->mnt,
+ child->mnt_mountpoint);
+ if (q)
+ mnt_change_mountpoint(child, smp, q);
commit_tree(child);
}
put_mountpoint(smp);
@@ -2373,6 +2376,10 @@ static int do_change_type(struct path *path, int ms_flags)
return -EINVAL;
namespace_lock();
+ if (!check_mnt(mnt)) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
if (type == MS_SHARED) {
err = invent_group_ids(mnt, recurse);
if (err)
@@ -2811,7 +2818,7 @@ static int do_set_group(struct path *from_path, struct path *to_path)
if (IS_MNT_SLAVE(from)) {
struct mount *m = from->mnt_master;
- list_add(&to->mnt_slave, &m->mnt_slave_list);
+ list_add(&to->mnt_slave, &from->mnt_slave);
to->mnt_master = m;
}
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 14a72224b657..899e25e9b4eb 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -2,6 +2,7 @@
config NFS_FS
tristate "NFS client support"
depends on INET && FILE_LOCKING && MULTIUSER
+ select CRC32
select LOCKD
select SUNRPC
select NFS_ACL_SUPPORT if NFS_V3_ACL
@@ -194,7 +195,6 @@ config NFS_USE_KERNEL_DNS
config NFS_DEBUG
bool
depends on NFS_FS && SUNRPC_DEBUG
- select CRC32
default y
config NFS_DISABLE_UDP_SUPPORT
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index a8930e6c417f..de4ad41b14e2 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1052,6 +1052,8 @@ struct nfs_server *nfs_create_server(struct fs_context *fc)
if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
server->namelen = NFS2_MAXNAMLEN;
}
+ /* Linux 'subtree_check' borkenness mandates this setting */
+ server->fh_expire_type = NFS_FH_VOL_RENAME;
if (!(fattr->valid & NFS_ATTR_FATTR)) {
error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh,
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 39c697e100b1..17b38da17288 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -297,7 +297,8 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi)
if (delegation == NULL)
goto out;
spin_lock(&delegation->lock);
- if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
+ if (delegation->inode &&
+ !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
clear_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags);
/* Refcount matched in nfs_end_delegation_return() */
ret = nfs_get_delegation(delegation);
@@ -570,17 +571,6 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
ret = true;
- else if (test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags)) {
- struct inode *inode;
-
- spin_lock(&delegation->lock);
- inode = delegation->inode;
- if (inode && list_empty(&NFS_I(inode)->open_files))
- ret = true;
- spin_unlock(&delegation->lock);
- }
- if (ret)
- clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) ||
test_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags) ||
test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
@@ -821,11 +811,25 @@ int nfs4_inode_make_writeable(struct inode *inode)
return nfs4_inode_return_delegation(inode);
}
-static void nfs_mark_return_if_closed_delegation(struct nfs_server *server,
- struct nfs_delegation *delegation)
+static void
+nfs_mark_return_if_closed_delegation(struct nfs_server *server,
+ struct nfs_delegation *delegation)
{
- set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
- set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+ struct inode *inode;
+
+ if (test_bit(NFS_DELEGATION_RETURN, &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags))
+ return;
+ spin_lock(&delegation->lock);
+ inode = delegation->inode;
+ if (!inode)
+ goto out;
+ if (list_empty(&NFS_I(inode)->open_files))
+ nfs_mark_return_delegation(server, delegation);
+ else
+ set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
+out:
+ spin_unlock(&delegation->lock);
}
static bool nfs_server_mark_return_all_delegations(struct nfs_server *server)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 70660ff248b7..3c98049912df 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1825,9 +1825,7 @@ static void block_revalidate(struct dentry *dentry)
static void unblock_revalidate(struct dentry *dentry)
{
- /* store_release ensures wait_var_event() sees the update */
- smp_store_release(&dentry->d_fsdata, NULL);
- wake_up_var(&dentry->d_fsdata);
+ store_release_wake_up(&dentry->d_fsdata, NULL);
}
/*
@@ -2632,6 +2630,18 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data)
unblock_revalidate(new_dentry);
}
+static bool nfs_rename_is_unsafe_cross_dir(struct dentry *old_dentry,
+ struct dentry *new_dentry)
+{
+ struct nfs_server *server = NFS_SB(old_dentry->d_sb);
+
+ if (old_dentry->d_parent != new_dentry->d_parent)
+ return false;
+ if (server->fh_expire_type & NFS_FH_RENAME_UNSAFE)
+ return !(server->fh_expire_type & NFS_FH_NOEXPIRE_WITH_OPEN);
+ return true;
+}
+
/*
* RENAME
* FIXME: Some nfsds, like the Linux user space nfsd, may generate a
@@ -2719,7 +2729,8 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
}
- if (S_ISREG(old_inode->i_mode))
+ if (S_ISREG(old_inode->i_mode) &&
+ nfs_rename_is_unsafe_cross_dir(old_dentry, new_dentry))
nfs_sync_inode(old_inode);
task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry,
must_unblock ? nfs_unblock_rename : NULL);
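
The new helper's flag logic can be read as: the pre-rename flush is skipped only when the server has declared that rename invalidates file handles (NFS_FH_RENAME_UNSAFE) yet open files keep theirs (NFS_FH_NOEXPIRE_WITH_OPEN). A small sketch of that decision with illustrative flag values, not the real nfs_fh constants:

#include <stdbool.h>
#include <stdint.h>

#define FH_RENAME_UNSAFE	0x1	/* illustrative values */
#define FH_NOEXPIRE_WITH_OPEN	0x2

static bool must_sync_before_rename(uint32_t fh_expire_type)
{
	if (fh_expire_type & FH_RENAME_UNSAFE)
		return !(fh_expire_type & FH_NOEXPIRE_WITH_OPEN);
	return true;	/* conservative default, as in the hunk above */
}
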
diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index 9fe9586a51b7..aacf6220ab44 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -66,14 +66,21 @@ nfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
{
struct nfs_fattr *fattr = NULL;
struct nfs_fh *server_fh = nfs_exp_embedfh(fid->raw);
- size_t fh_size = offsetof(struct nfs_fh, data) + server_fh->size;
+ size_t fh_size = offsetof(struct nfs_fh, data);
const struct nfs_rpc_ops *rpc_ops;
struct dentry *dentry;
struct inode *inode;
- int len = EMBED_FH_OFF + XDR_QUADLEN(fh_size);
+ int len = EMBED_FH_OFF;
u32 *p = fid->raw;
int ret;
+ /* Initial check of bounds */
+ if (fh_len < len + XDR_QUADLEN(fh_size) ||
+ fh_len > XDR_QUADLEN(NFS_MAXFHSIZE))
+ return NULL;
+ /* Calculate embedded filehandle size */
+ fh_size += server_fh->size;
+ len += XDR_QUADLEN(fh_size);
/* NULL translates to ESTALE */
if (fh_len < len || fh_type != len)
return NULL;
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index acf4b88889dc..d5f1fbfd9a0c 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -75,6 +75,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
struct page *scratch;
struct list_head dsaddrs;
struct nfs4_pnfs_ds_addr *da;
+ struct net *net = server->nfs_client->cl_net;
/* set up xdr stream */
scratch = alloc_page(gfp_flags);
@@ -158,8 +159,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
mp_count = be32_to_cpup(p); /* multipath count */
for (j = 0; j < mp_count; j++) {
- da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
- &stream, gfp_flags);
+ da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags);
if (da)
list_add_tail(&da->da_node, &dsaddrs);
}
@@ -169,7 +169,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
goto out_err_free_deviceid;
}
- dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
+ dsaddr->ds_list[i] = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags);
if (!dsaddr->ds_list[i])
goto out_err_drain_dsaddrs;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 8056b05bd8dc..5dd16f4ae74d 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -745,14 +745,14 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
{
struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
struct nfs4_ff_layout_mirror *mirror;
- struct nfs4_pnfs_ds *ds;
+ struct nfs4_pnfs_ds *ds = ERR_PTR(-EAGAIN);
u32 idx;
/* mirrors are initially sorted by efficiency */
for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
- if (!ds)
+ if (IS_ERR(ds))
continue;
if (check_device &&
@@ -760,10 +760,10 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
continue;
*best_idx = idx;
- return ds;
+ break;
}
- return NULL;
+ return ds;
}
static struct nfs4_pnfs_ds *
@@ -933,7 +933,7 @@ retry:
for (i = 0; i < pgio->pg_mirror_count; i++) {
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true);
- if (!ds) {
+ if (IS_ERR(ds)) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
pnfs_generic_pg_cleanup(pgio);
@@ -1096,6 +1096,7 @@ static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
}
static int ff_layout_async_handle_error_v4(struct rpc_task *task,
+ u32 op_status,
struct nfs4_state *state,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
@@ -1106,32 +1107,42 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
- switch (task->tk_status) {
- case -NFS4ERR_BADSESSION:
- case -NFS4ERR_BADSLOT:
- case -NFS4ERR_BAD_HIGH_SLOT:
- case -NFS4ERR_DEADSESSION:
- case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
- case -NFS4ERR_SEQ_FALSE_RETRY:
- case -NFS4ERR_SEQ_MISORDERED:
+ switch (op_status) {
+ case NFS4_OK:
+ case NFS4ERR_NXIO:
+ break;
+ case NFSERR_PERM:
+ if (!task->tk_xprt)
+ break;
+ xprt_force_disconnect(task->tk_xprt);
+ goto out_retry;
+ case NFS4ERR_BADSESSION:
+ case NFS4ERR_BADSLOT:
+ case NFS4ERR_BAD_HIGH_SLOT:
+ case NFS4ERR_DEADSESSION:
+ case NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case NFS4ERR_SEQ_FALSE_RETRY:
+ case NFS4ERR_SEQ_MISORDERED:
dprintk("%s ERROR %d, Reset session. Exchangeid "
"flags 0x%x\n", __func__, task->tk_status,
clp->cl_exchange_flags);
nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
- break;
- case -NFS4ERR_DELAY:
- case -NFS4ERR_GRACE:
+ goto out_retry;
+ case NFS4ERR_DELAY:
+ nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+ fallthrough;
+ case NFS4ERR_GRACE:
rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
- break;
- case -NFS4ERR_RETRY_UNCACHED_REP:
- break;
+ goto out_retry;
+ case NFS4ERR_RETRY_UNCACHED_REP:
+ goto out_retry;
/* Invalidate Layout errors */
- case -NFS4ERR_PNFS_NO_LAYOUT:
- case -ESTALE: /* mapped NFS4ERR_STALE */
- case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */
- case -EISDIR: /* mapped NFS4ERR_ISDIR */
- case -NFS4ERR_FHEXPIRED:
- case -NFS4ERR_WRONG_TYPE:
+ case NFS4ERR_PNFS_NO_LAYOUT:
+ case NFS4ERR_STALE:
+ case NFS4ERR_BADHANDLE:
+ case NFS4ERR_ISDIR:
+ case NFS4ERR_FHEXPIRED:
+ case NFS4ERR_WRONG_TYPE:
dprintk("%s Invalid layout error %d\n", __func__,
task->tk_status);
/*
@@ -1144,6 +1155,11 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
pnfs_destroy_layout(NFS_I(inode));
rpc_wake_up(&tbl->slot_tbl_waitq);
goto reset;
+ default:
+ break;
+ }
+
+ switch (task->tk_status) {
/* RPC connection errors */
case -ECONNREFUSED:
case -EHOSTDOWN:
@@ -1159,26 +1175,56 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
nfs4_delete_deviceid(devid->ld, devid->nfs_client,
&devid->deviceid);
rpc_wake_up(&tbl->slot_tbl_waitq);
- fallthrough;
+ break;
default:
- if (ff_layout_avoid_mds_available_ds(lseg))
- return -NFS4ERR_RESET_TO_PNFS;
-reset:
- dprintk("%s Retry through MDS. Error %d\n", __func__,
- task->tk_status);
- return -NFS4ERR_RESET_TO_MDS;
+ break;
}
+
+ if (ff_layout_avoid_mds_available_ds(lseg))
+ return -NFS4ERR_RESET_TO_PNFS;
+reset:
+ dprintk("%s Retry through MDS. Error %d\n", __func__,
+ task->tk_status);
+ return -NFS4ERR_RESET_TO_MDS;
+
+out_retry:
task->tk_status = 0;
return -EAGAIN;
}
/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
static int ff_layout_async_handle_error_v3(struct rpc_task *task,
+ u32 op_status,
+ struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
u32 idx)
{
struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ switch (op_status) {
+ case NFS_OK:
+ case NFSERR_NXIO:
+ break;
+ case NFSERR_PERM:
+ if (!task->tk_xprt)
+ break;
+ xprt_force_disconnect(task->tk_xprt);
+ goto out_retry;
+ case NFSERR_ACCES:
+ case NFSERR_BADHANDLE:
+ case NFSERR_FBIG:
+ case NFSERR_IO:
+ case NFSERR_NOSPC:
+ case NFSERR_ROFS:
+ case NFSERR_STALE:
+ goto out_reset_to_pnfs;
+ case NFSERR_JUKEBOX:
+ nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+ goto out_retry;
+ default:
+ break;
+ }
+
switch (task->tk_status) {
/* File access problems. Don't mark the device as unavailable */
case -EACCES:
@@ -1197,6 +1243,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
nfs4_delete_deviceid(devid->ld, devid->nfs_client,
&devid->deviceid);
}
+out_reset_to_pnfs:
/* FIXME: Need to prevent infinite looping here. */
return -NFS4ERR_RESET_TO_PNFS;
out_retry:
@@ -1207,6 +1254,7 @@ out_retry:
}
static int ff_layout_async_handle_error(struct rpc_task *task,
+ u32 op_status,
struct nfs4_state *state,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
@@ -1225,10 +1273,11 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
switch (vers) {
case 3:
- return ff_layout_async_handle_error_v3(task, lseg, idx);
- case 4:
- return ff_layout_async_handle_error_v4(task, state, clp,
+ return ff_layout_async_handle_error_v3(task, op_status, clp,
lseg, idx);
+ case 4:
+ return ff_layout_async_handle_error_v4(task, op_status, state,
+ clp, lseg, idx);
default:
/* should never happen */
WARN_ON_ONCE(1);
@@ -1255,6 +1304,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
case -ECONNRESET:
case -EHOSTDOWN:
case -EHOSTUNREACH:
+ case -ENETDOWN:
case -ENETUNREACH:
case -EADDRINUSE:
case -ENOBUFS:
@@ -1280,6 +1330,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
switch (status) {
case NFS4ERR_DELAY:
case NFS4ERR_GRACE:
+ case NFS4ERR_PERM:
break;
case NFS4ERR_NXIO:
ff_layout_mark_ds_unreachable(lseg, idx);
@@ -1312,7 +1363,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
trace_ff_layout_read_error(hdr);
}
- err = ff_layout_async_handle_error(task, hdr->args.context->state,
+ err = ff_layout_async_handle_error(task, hdr->res.op_status,
+ hdr->args.context->state,
hdr->ds_clp, hdr->lseg,
hdr->pgio_mirror_idx);
@@ -1482,7 +1534,8 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
trace_ff_layout_write_error(hdr);
}
- err = ff_layout_async_handle_error(task, hdr->args.context->state,
+ err = ff_layout_async_handle_error(task, hdr->res.op_status,
+ hdr->args.context->state,
hdr->ds_clp, hdr->lseg,
hdr->pgio_mirror_idx);
@@ -1528,8 +1581,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
trace_ff_layout_commit_error(data);
}
- err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
- data->lseg, data->ds_commit_index);
+ err = ff_layout_async_handle_error(task, data->res.op_status,
+ NULL, data->ds_clp, data->lseg,
+ data->ds_commit_index);
trace_nfs4_pnfs_commit_ds(data, err);
switch (err) {
@@ -1785,6 +1839,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
u32 idx = hdr->pgio_mirror_idx;
int vers;
struct nfs_fh *fh;
+ bool ds_fatal_error = false;
dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
__func__, hdr->inode->i_ino,
@@ -1792,8 +1847,10 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
mirror = FF_LAYOUT_COMP(lseg, idx);
ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
- if (!ds)
+ if (IS_ERR(ds)) {
+ ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds));
goto out_failed;
+ }
ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
hdr->inode);
@@ -1834,7 +1891,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
return PNFS_ATTEMPTED;
out_failed:
- if (ff_layout_avoid_mds_available_ds(lseg))
+ if (ff_layout_avoid_mds_available_ds(lseg) && !ds_fatal_error)
return PNFS_TRY_AGAIN;
trace_pnfs_mds_fallback_read_pagelist(hdr->inode,
hdr->args.offset, hdr->args.count,
@@ -1855,11 +1912,14 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
int vers;
struct nfs_fh *fh;
u32 idx = hdr->pgio_mirror_idx;
+ bool ds_fatal_error = false;
mirror = FF_LAYOUT_COMP(lseg, idx);
ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
- if (!ds)
+ if (IS_ERR(ds)) {
+ ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds));
goto out_failed;
+ }
ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
hdr->inode);
@@ -1902,7 +1962,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
return PNFS_ATTEMPTED;
out_failed:
- if (ff_layout_avoid_mds_available_ds(lseg))
+ if (ff_layout_avoid_mds_available_ds(lseg) && !ds_fatal_error)
return PNFS_TRY_AGAIN;
trace_pnfs_mds_fallback_write_pagelist(hdr->inode,
hdr->args.offset, hdr->args.count,
@@ -1944,7 +2004,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
mirror = FF_LAYOUT_COMP(lseg, idx);
ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
- if (!ds)
+ if (IS_ERR(ds))
goto out_err;
ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index e028f5a0ef5f..95d5dca67145 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -49,6 +49,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
struct nfs4_pnfs_ds_addr *da;
struct nfs4_ff_layout_ds *new_ds = NULL;
struct nfs4_ff_ds_version *ds_versions = NULL;
+ struct net *net = server->nfs_client->cl_net;
u32 mp_count;
u32 version_count;
__be32 *p;
@@ -80,8 +81,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
for (i = 0; i < mp_count; i++) {
/* multipath ds */
- da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
- &stream, gfp_flags);
+ da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags);
if (da)
list_add_tail(&da->da_node, &dsaddrs);
}
@@ -149,7 +149,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
new_ds->ds_versions = ds_versions;
new_ds->ds_versions_cnt = version_count;
- new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
+ new_ds->ds = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags);
if (!new_ds->ds)
goto out_err_drain_dsaddrs;
@@ -370,11 +370,11 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
struct nfs4_ff_layout_mirror *mirror,
bool fail_return)
{
- struct nfs4_pnfs_ds *ds = NULL;
+ struct nfs4_pnfs_ds *ds;
struct inode *ino = lseg->pls_layout->plh_inode;
struct nfs_server *s = NFS_SERVER(ino);
unsigned int max_payload;
- int status;
+ int status = -EAGAIN;
if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror))
goto noconnect;
@@ -412,7 +412,7 @@ noconnect:
ff_layout_send_layouterror(lseg);
if (fail_return || !ff_layout_has_available_ds(lseg))
pnfs_error_mark_layout_for_return(ino, lseg);
- ds = NULL;
+ ds = ERR_PTR(status);
out:
return ds;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 964df0725f4c..627410be2e88 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -74,6 +74,8 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
{
+ if (unlikely(nfs_current_task_exiting()))
+ return -EINTR;
schedule();
if (signal_pending_state(mode, current))
return -ERESTARTSYS;
@@ -553,6 +555,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
set_nlink(inode, fattr->nlink);
else if (fattr_supported & NFS_ATTR_FATTR_NLINK)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK);
+ else
+ set_nlink(inode, 1);
if (fattr->valid & NFS_ATTR_FATTR_OWNER)
inode->i_uid = fattr->uid;
else if (fattr_supported & NFS_ATTR_FATTR_OWNER)
@@ -2427,15 +2431,26 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
static int nfs_net_init(struct net *net)
{
struct nfs_net *nn = net_generic(net, nfs_net_id);
+ int err;
nfs_clients_init(net);
if (!rpc_proc_register(net, &nn->rpcstats)) {
- nfs_clients_exit(net);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto err_proc_rpc;
}
- return nfs_fs_proc_net_init(net);
+ err = nfs_fs_proc_net_init(net);
+ if (err)
+ goto err_proc_nfs;
+
+ return 0;
+
+err_proc_nfs:
+ rpc_proc_unregister(net, "nfs");
+err_proc_rpc:
+ nfs_clients_exit(net);
+ return err;
}
static void nfs_net_exit(struct net *net)
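
The nfs_net_init() rework above converts ad-hoc cleanup into the standard goto-unwind shape: each setup step gains a label, and a failure jumps to the label that undoes everything already done, newest first. A generic sketch with hypothetical helpers:

#include <errno.h>

static int register_stats(void)  { return 0; }	/* hypothetical */
static void unregister_stats(void) { }
static int register_proc(void)   { return 0; }	/* hypothetical */
static void clients_exit(void)   { }

int net_init(void)
{
	int err;

	err = register_stats();
	if (err)
		goto err_stats;

	err = register_proc();
	if (err)
		goto err_proc;

	return 0;

err_proc:
	unregister_stats();	/* undo in reverse order of setup */
err_stats:
	clients_exit();
	return err;
}
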
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7fa23a6368e0..6ea10abfa851 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -598,9 +598,12 @@ nfs_write_match_verf(const struct nfs_writeverf *verf,
static inline gfp_t nfs_io_gfp_mask(void)
{
- if (current->flags & PF_WQ_WORKER)
- return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
- return GFP_KERNEL;
+ gfp_t ret = current_gfp_context(GFP_KERNEL);
+
+	/* Workers get __GFP_NORETRY only when __GFP_IO and __GFP_FS are still set */
+ if ((current->flags & PF_WQ_WORKER) && ret == GFP_KERNEL)
+ ret |= __GFP_NORETRY | __GFP_NOWARN;
+ return ret;
}
/*
@@ -826,33 +829,16 @@ u64 nfs_timespec_to_change_attr(const struct timespec64 *ts)
return ((u64)ts->tv_sec << 30) + ts->tv_nsec;
}
-#ifdef CONFIG_CRC32
-/**
- * nfs_fhandle_hash - calculate the crc32 hash for the filehandle
- * @fh - pointer to filehandle
- *
- * returns a crc32 hash for the filehandle that is compatible with
- * the one displayed by "wireshark".
- */
-static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
-{
- return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size);
-}
static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)
{
return ~crc32_le(0xFFFFFFFF, &stateid->other[0],
NFS4_STATEID_OTHER_SIZE);
}
-#else
-static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
-{
- return 0;
-}
-static inline u32 nfs_stateid_hash(nfs4_stateid *stateid)
+
+static inline bool nfs_current_task_exiting(void)
{
- return 0;
+ return (current->flags & PF_EXITING) != 0;
}
-#endif
static inline bool nfs_error_is_fatal(int err)
{
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 2e7579626cf0..f036d30f7515 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -39,7 +39,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
__set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
schedule_timeout(NFS_JUKEBOX_RETRY_TIME);
res = -ERESTARTSYS;
- } while (!fatal_signal_pending(current));
+ } while (!fatal_signal_pending(current) && !nfs_current_task_exiting());
return res;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5b06b8d4e014..4abac68a4f0f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -422,6 +422,8 @@ static int nfs4_delay_killable(long *timeout)
{
might_sleep();
+ if (unlikely(nfs_current_task_exiting()))
+ return -EINTR;
__set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
schedule_timeout(nfs4_update_delay(timeout));
if (!__fatal_signal_pending(current))
@@ -433,6 +435,8 @@ static int nfs4_delay_interruptible(long *timeout)
{
might_sleep();
+ if (unlikely(nfs_current_task_exiting()))
+ return -EINTR;
__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE);
schedule_timeout(nfs4_update_delay(timeout));
if (!signal_pending(current))
@@ -1712,7 +1716,8 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state,
rcu_read_unlock();
trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0);
- if (!fatal_signal_pending(current)) {
+ if (!fatal_signal_pending(current) &&
+ !nfs_current_task_exiting()) {
if (schedule_timeout(5*HZ) == 0)
status = -EAGAIN;
else
@@ -3500,7 +3505,7 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst,
write_sequnlock(&state->seqlock);
trace_nfs4_close_stateid_update_wait(state->inode, dst, 0);
- if (fatal_signal_pending(current))
+ if (fatal_signal_pending(current) || nfs_current_task_exiting())
status = -EINTR;
else
if (schedule_timeout(5*HZ) != 0)
@@ -6060,6 +6065,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen,
struct nfs_server *server = NFS_SERVER(inode);
int ret;
+ if (unlikely(NFS_FH(inode)->size == 0))
+ return -ENODATA;
if (!nfs4_server_supports_acls(server, type))
return -EOPNOTSUPP;
ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
@@ -6134,6 +6141,9 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf,
{
struct nfs4_exception exception = { };
int err;
+
+ if (unlikely(NFS_FH(inode)->size == 0))
+ return -ENODATA;
do {
err = __nfs4_proc_set_acl(inode, buf, buflen, type);
trace_nfs4_set_acl(inode, err);
@@ -6886,10 +6896,18 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
struct nfs4_unlockdata *p;
struct nfs4_state *state = lsp->ls_state;
struct inode *inode = state->inode;
+ struct nfs_lock_context *l_ctx;
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (p == NULL)
return NULL;
+ l_ctx = nfs_get_lock_context(ctx);
+ if (!IS_ERR(l_ctx)) {
+ p->l_ctx = l_ctx;
+ } else {
+ kfree(p);
+ return NULL;
+ }
p->arg.fh = NFS_FH(inode);
p->arg.fl = &p->fl;
p->arg.seqid = seqid;
@@ -6897,7 +6915,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
p->lsp = lsp;
/* Ensure we don't close file until we're done freeing locks! */
p->ctx = get_nfs_open_context(ctx);
- p->l_ctx = nfs_get_lock_context(ctx);
locks_init_lock(&p->fl);
locks_copy_lock(&p->fl, fl);
p->server = NFS_SERVER(inode);
@@ -10618,7 +10635,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
{
- ssize_t error, error2, error3;
+ ssize_t error, error2, error3, error4 = 0;
size_t left = size;
error = generic_listxattr(dentry, list, left);
@@ -10641,8 +10658,18 @@ static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, left);
if (error3 < 0)
return error3;
+ if (list) {
+ list += error3;
+ left -= error3;
+ }
+
+ if (!nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) {
+ error4 = security_inode_listsecurity(d_inode(dentry), list, left);
+ if (error4 < 0)
+ return error4;
+ }
- error += error2 + error3;
+ error += error2 + error3 + error4;
if (size && error > size)
return -ERANGE;
return error;
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 351616c61df5..f9c291e2165c 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -148,16 +148,12 @@ static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst,
memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN);
}
-#ifdef CONFIG_CRC32
/*
* nfs_session_id_hash - calculate the crc32 hash for the session id
* @session - pointer to session
*/
#define nfs_session_id_hash(sess_id) \
(~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data)))
-#else
-#define nfs_session_id_hash(session) (0)
-#endif
#else /* defined(CONFIG_NFS_V4_1) */
static inline int nfs4_init_session(struct nfs_client *clp)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 48ea40660422..80a7c5bd7a47 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2737,7 +2737,15 @@ out_error:
pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s"
" with error %d\n", section_sep, section,
clp->cl_hostname, -status);
- ssleep(1);
+ switch (status) {
+ case -ENETDOWN:
+ case -ENETUNREACH:
+ nfs_mark_client_ready(clp, -EIO);
+ break;
+ default:
+ ssleep(1);
+ break;
+ }
out_drain:
memalloc_nofs_restore(memflags);
nfs4_end_drain_session(clp);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index f68286932019..7f48e0d870bd 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -732,6 +732,14 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
return remaining;
}
+static void pnfs_reset_return_info(struct pnfs_layout_hdr *lo)
+{
+ struct pnfs_layout_segment *lseg;
+
+ list_for_each_entry(lseg, &lo->plh_return_segs, pls_list)
+ pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
+}
+
static void
pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
struct list_head *free_me,
@@ -1180,6 +1188,7 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq);
pnfs_free_returned_lsegs(lo, &freeme, range, seq);
pnfs_set_layout_stateid(lo, stateid, NULL, true);
+ pnfs_reset_return_info(lo);
} else
pnfs_mark_layout_stateid_invalid(lo, &freeme);
out_unlock:
@@ -1921,8 +1930,10 @@ static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)
static void nfs_layoutget_end(struct pnfs_layout_hdr *lo)
{
if (atomic_dec_and_test(&lo->plh_outstanding) &&
- test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags))
+ test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) {
+ smp_mb__after_atomic();
wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN);
+ }
}
static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo)
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e3e6a41f19de..f5173c188184 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -59,6 +59,7 @@ struct nfs4_pnfs_ds {
struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
char *ds_remotestr; /* comma sep list of addrs */
struct list_head ds_addrs;
+ const struct net *ds_net;
struct nfs_client *ds_clp;
refcount_t ds_count;
unsigned long ds_state;
@@ -405,7 +406,8 @@ int pnfs_generic_commit_pagelist(struct inode *inode,
int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max);
void pnfs_generic_write_commit_done(struct rpc_task *task, void *data);
void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds);
-struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs,
+struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(const struct net *net,
+ struct list_head *dsaddrs,
gfp_t gfp_flags);
void nfs4_pnfs_v3_ds_connect_unload(void);
int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 47a8da3f5c9f..31afa88742f6 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -651,12 +651,12 @@ _same_data_server_addrs_locked(const struct list_head *dsaddrs1,
* Lookup DS by addresses. nfs4_ds_cache_lock is held
*/
static struct nfs4_pnfs_ds *
-_data_server_lookup_locked(const struct list_head *dsaddrs)
+_data_server_lookup_locked(const struct net *net, const struct list_head *dsaddrs)
{
struct nfs4_pnfs_ds *ds;
list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
- if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
+ if (ds->ds_net == net && _same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
return ds;
return NULL;
}
@@ -763,7 +763,7 @@ out_err:
* uncached and return cached struct nfs4_pnfs_ds.
*/
struct nfs4_pnfs_ds *
-nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
+nfs4_pnfs_ds_add(const struct net *net, struct list_head *dsaddrs, gfp_t gfp_flags)
{
struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
char *remotestr;
@@ -781,13 +781,14 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
spin_lock(&nfs4_ds_cache_lock);
- tmp_ds = _data_server_lookup_locked(dsaddrs);
+ tmp_ds = _data_server_lookup_locked(net, dsaddrs);
if (tmp_ds == NULL) {
INIT_LIST_HEAD(&ds->ds_addrs);
list_splice_init(dsaddrs, &ds->ds_addrs);
ds->ds_remotestr = remotestr;
refcount_set(&ds->ds_count, 1);
INIT_LIST_HEAD(&ds->ds_node);
+ ds->ds_net = net;
ds->ds_clp = NULL;
list_add(&ds->ds_node, &nfs4_data_server_cache);
dprintk("%s add new data server %s\n", __func__,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 3dffeb1d17b9..2dca011da034 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1018,6 +1018,16 @@ int nfs_reconfigure(struct fs_context *fc)
sync_filesystem(sb);
/*
+ * The SB_RDONLY flag has been removed from the superblock during
+ * mounts to prevent interference between different filesystems.
+ * Similarly, it is also necessary to ignore the SB_RDONLY flag
+ * during reconfiguration; otherwise, it may also result in the
+ * creation of redundant superblocks when mounting a directory with
+ * different rw and ro flags multiple times.
+ */
+ fc->sb_flags_mask &= ~SB_RDONLY;
+
+ /*
* Userspace mount programs that send binary options generally send
* them populated with default values. We have no way to know which
* ones were explicitly specified. Fall back to legacy behavior and
@@ -1273,8 +1283,17 @@ int nfs_get_tree_common(struct fs_context *fc)
if (IS_ERR(server))
return PTR_ERR(server);
+ /*
+	 * When NFS_MOUNT_UNSHARED is not set, NFS forces the sharing of a
+	 * superblock among all mounts of sub-directories belonging to a
+	 * single exported root path.
+ * To prevent interference between different filesystems, the
+ * SB_RDONLY flag should be removed from the superblock.
+ */
if (server->flags & NFS_MOUNT_UNSHARED)
compare_super = NULL;
+ else
+ fc->sb_flags &= ~SB_RDONLY;
/* -o noac implies -o sync */
if (server->flags & NFS_MOUNT_NOAC)
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 7c441f2bd444..4f704f868d9c 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -4,6 +4,7 @@ config NFSD
depends on INET
depends on FILE_LOCKING
depends on FSNOTIFY
+ select CRC32
select LOCKD
select SUNRPC
select EXPORTFS
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 6eb02390bd42..a5e017d94215 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -3536,7 +3536,8 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow;
u32 opiter;
- if (!cstate->minorversion)
+ if (rqstp->rq_procinfo != &nfsd_version4.vs_proc[NFSPROC4_COMPOUND] ||
+ cstate->minorversion == 0)
return false;
if (cstate->spo_must_allowed)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f6fa719ee326..bdee95d714d0 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1069,6 +1069,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
return openlockstateid(stid);
}
+/*
+ * As the delegation's sc_free callback, this may be called by
+ * nfs4_put_stid() from nfsd_break_one_deleg(). Since
+ * nfsd_break_one_deleg() runs with flc->flc_lock held,
+ * this function must never sleep.
+ */
static void nfs4_free_deleg(struct nfs4_stid *stid)
{
struct nfs4_delegation *dp = delegstateid(stid);
@@ -4923,6 +4929,7 @@ static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
{
+ bool queued;
/*
* We're assuming the state code never drops its reference
* without first removing the lease. Since we're in this lease
@@ -4931,7 +4938,10 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
* we know it's safe to take a reference.
*/
refcount_inc(&dp->dl_stid.sc_count);
- WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall));
+ queued = nfsd4_run_cb(&dp->dl_recall);
+ WARN_ON_ONCE(!queued);
+ if (!queued)
+ refcount_dec(&dp->dl_stid.sc_count);
}
/* Called from break_lease() with flc_lock held. */
@@ -6237,14 +6247,19 @@ deleg_reaper(struct nfsd_net *nn)
spin_lock(&nn->client_lock);
list_for_each_safe(pos, next, &nn->client_lru) {
clp = list_entry(pos, struct nfs4_client, cl_lru);
- if (clp->cl_state != NFSD4_ACTIVE ||
- list_empty(&clp->cl_delegations) ||
- atomic_read(&clp->cl_delegs_in_recall) ||
- test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags) ||
- (ktime_get_boottime_seconds() -
- clp->cl_ra_time < 5)) {
+
+ if (clp->cl_state != NFSD4_ACTIVE)
+ continue;
+ if (list_empty(&clp->cl_delegations))
+ continue;
+ if (atomic_read(&clp->cl_delegs_in_recall))
+ continue;
+ if (test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags))
+ continue;
+ if (ktime_get_boottime_seconds() - clp->cl_ra_time < 5)
+ continue;
+ if (clp->cl_cb_state != NFSD4_CB_UP)
continue;
- }
list_add(&clp->cl_ra_cblist, &cblist);
/* release in nfsd4_cb_recall_any_release */
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 513e028b0bbe..40aee06ebd95 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -263,7 +263,6 @@ static inline bool fh_fsid_match(const struct knfsd_fh *fh1,
return true;
}
-#ifdef CONFIG_CRC32
/**
* knfsd_fh_hash - calculate the crc32 hash for the filehandle
* @fh - pointer to filehandle
@@ -275,12 +274,6 @@ static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
{
return ~crc32_le(0xFFFFFFFF, fh->fh_raw, fh->fh_size);
}
-#else
-static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
-{
- return 0;
-}
-#endif
/**
* fh_clear_pre_post_attrs - Reset pre/post attributes
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 80a2b3631adb..0f6428892e06 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -427,13 +427,13 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred)
if (ret)
goto out_filecache;
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+ nfsd4_ssc_init_umount_work(nn);
+#endif
ret = nfs4_state_start_net(net);
if (ret)
goto out_reply_cache;
-#ifdef CONFIG_NFSD_V4_2_INTER_SSC
- nfsd4_ssc_init_umount_work(nn);
-#endif
nn->nfsd_net_up = true;
return 0;
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 3139a1863751..29cb1236e1a9 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -2094,11 +2094,13 @@ static int nilfs_btree_propagate(struct nilfs_bmap *btree,
ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
if (ret < 0) {
- if (unlikely(ret == -ENOENT))
+ if (unlikely(ret == -ENOENT)) {
nilfs_crit(btree->b_inode->i_sb,
"writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d",
btree->b_inode->i_ino,
(unsigned long long)key, level);
+ ret = -EINVAL;
+ }
goto out;
}
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 889e3e570213..0f3753af1674 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -64,12 +64,6 @@ static inline unsigned int nilfs_chunk_size(struct inode *inode)
return inode->i_sb->s_blocksize;
}
-static inline void nilfs_put_page(struct page *page)
-{
- kunmap(page);
- put_page(page);
-}
-
/*
* Return the offset into page `page_nr' of the last valid
* byte in that page, plus one.
@@ -450,8 +444,7 @@ int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino)
return 0;
}
-/* Releases the page */
-void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
+int nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
struct page *page, struct inode *inode)
{
unsigned int from = (char *)de - (char *)page_address(page);
@@ -461,12 +454,15 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
lock_page(page);
err = nilfs_prepare_chunk(page, from, to);
- BUG_ON(err);
+ if (unlikely(err)) {
+ unlock_page(page);
+ return err;
+ }
de->inode = cpu_to_le64(inode->i_ino);
nilfs_set_de_type(de, inode);
nilfs_commit_chunk(page, mapping, from, to);
- nilfs_put_page(page);
dir->i_mtime = dir->i_ctime = current_time(dir);
+ return 0;
}
/*
@@ -569,7 +565,7 @@ out_unlock:
/*
* nilfs_delete_entry deletes a directory entry by merging it with the
- * previous entry. Page is up-to-date. Releases the page.
+ * previous entry. Page is up-to-date.
*/
int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
{
@@ -598,14 +594,16 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
from = (char *)pde - (char *)page_address(page);
lock_page(page);
err = nilfs_prepare_chunk(page, from, to);
- BUG_ON(err);
+ if (unlikely(err)) {
+ unlock_page(page);
+ goto out;
+ }
if (pde)
pde->rec_len = nilfs_rec_len_to_disk(to - from);
dir->inode = 0;
nilfs_commit_chunk(page, mapping, from, to);
inode->i_ctime = inode->i_mtime = current_time(inode);
out:
- nilfs_put_page(page);
return err;
}
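
The ownership change in this diff is worth spelling out: nilfs_set_link() and nilfs_delete_entry() used to release the page themselves, so callers could not retry or report errors cleanly. Now the caller that mapped the page also unmaps it, keeping acquire and release in one scope. A minimal sketch of the pattern with stand-in helpers:

#include <errno.h>
#include <stdlib.h>

struct page { int mapped; };

static struct page *map_entry(void) { return calloc(1, sizeof(struct page)); }
static void put_entry(struct page *p) { free(p); }
static int update_entry(struct page *p) { return p ? 0 : -EIO; }

int set_link(void)
{
	struct page *p = map_entry();
	int err;

	if (!p)
		return -ENOMEM;
	err = update_entry(p);	/* helper no longer releases the page */
	put_entry(p);		/* the mapper releases it on every path */
	return err;
}
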
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 893ab36824cc..2d8dc6b35b54 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -273,6 +273,9 @@ static int nilfs_direct_propagate(struct nilfs_bmap *bmap,
dat = nilfs_bmap_get_dat(bmap);
key = nilfs_bmap_data_get_key(bmap, bh);
ptr = nilfs_direct_get_ptr(bmap, key);
+ if (ptr == NILFS_BMAP_INVALID_PTR)
+ return -EINVAL;
+
if (!buffer_nilfs_volatile(bh)) {
oldreq.pr_entry_nr = ptr;
newreq.pr_entry_nr = ptr;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 452fb23d2e4c..1eb6c90fb7f4 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -517,11 +517,18 @@ static int __nilfs_read_inode(struct super_block *sb,
inode->i_op = &nilfs_symlink_inode_operations;
inode_nohighmem(inode);
inode->i_mapping->a_ops = &nilfs_aops;
- } else {
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &nilfs_special_inode_operations;
init_special_inode(
inode, inode->i_mode,
huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
+ } else {
+ nilfs_error(sb,
+ "invalid file type bits in mode 0%o for inode %lu",
+ inode->i_mode, ino);
+ err = -EIO;
+ goto failed_unmap;
}
nilfs_ifile_unmap_inode(root->ifile, ino, bh);
brelse(bh);
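
The __nilfs_read_inode() hunk above stops defaulting every unrecognized mode to a special inode and instead rejects inodes whose type bits match none of the known file types. A standalone sketch of that classification using the standard S_IS* macros (illustrative helper, not nilfs2 code):

/* Standalone sketch (not nilfs2 code): classify the on-disk mode the
 * way the patched __nilfs_read_inode() does, rejecting unknown types. */
#include <stdio.h>
#include <sys/stat.h>

/* Return 0 for a file type nilfs2 accepts, -1 for a corrupt mode. */
static int classify_mode(mode_t mode)
{
	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
		return 0;				/* regular handlers */
	if (S_ISCHR(mode) || S_ISBLK(mode) ||
	    S_ISFIFO(mode) || S_ISSOCK(mode))
		return 0;				/* special inode */
	return -1;					/* reject with -EIO */
}

int main(void)
{
	printf("%d\n", classify_mode(S_IFREG | 0644));	/* 0 */
	printf("%d\n", classify_mode(S_IFCHR | 0600));	/* 0 */
	printf("%d\n", classify_mode(0777));		/* -1, no type bits */
	return 0;
}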
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index a14f6342a025..67d66207fae1 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -297,6 +297,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
set_nlink(inode, 1);
}
err = nilfs_delete_entry(de, page);
+ nilfs_put_page(page);
if (err)
goto out;
@@ -405,7 +406,10 @@ static int nilfs_rename(struct user_namespace *mnt_userns,
err = PTR_ERR(new_de);
goto out_dir;
}
- nilfs_set_link(new_dir, new_de, new_page, old_inode);
+ err = nilfs_set_link(new_dir, new_de, new_page, old_inode);
+ nilfs_put_page(new_page);
+ if (unlikely(err))
+ goto out_dir;
nilfs_mark_inode_dirty(new_dir);
new_inode->i_ctime = current_time(new_inode);
if (dir_de)
@@ -428,28 +432,27 @@ static int nilfs_rename(struct user_namespace *mnt_userns,
*/
old_inode->i_ctime = current_time(old_inode);
- nilfs_delete_entry(old_de, old_page);
-
- if (dir_de) {
- nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
- drop_nlink(old_dir);
+ err = nilfs_delete_entry(old_de, old_page);
+ if (likely(!err)) {
+ if (dir_de) {
+ err = nilfs_set_link(old_inode, dir_de, dir_page,
+ new_dir);
+ drop_nlink(old_dir);
+ }
+ nilfs_mark_inode_dirty(old_dir);
}
- nilfs_mark_inode_dirty(old_dir);
nilfs_mark_inode_dirty(old_inode);
- err = nilfs_transaction_commit(old_dir->i_sb);
- return err;
-
out_dir:
- if (dir_de) {
- kunmap(dir_page);
- put_page(dir_page);
- }
+ if (dir_de)
+ nilfs_put_page(dir_page);
out_old:
- kunmap(old_page);
- put_page(old_page);
+ nilfs_put_page(old_page);
out:
- nilfs_transaction_abort(old_dir->i_sb);
+ if (likely(!err))
+ err = nilfs_transaction_commit(old_dir->i_sb);
+ else
+ nilfs_transaction_abort(old_dir->i_sb);
return err;
}
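
The reworked nilfs_rename() above funnels every exit through one label and decides between commit and abort based on the accumulated error, instead of committing on a separate early-return path. A minimal standalone sketch of that single-exit shape, with stubbed transaction calls (names hypothetical):

/* Standalone sketch (stubbed, not nilfs2 code): the single-exit
 * commit-or-abort shape the reworked nilfs_rename() converges on. */
#include <stdio.h>

static int transaction_commit(void) { puts("commit"); return 0; }
static void transaction_abort(void) { puts("abort"); }

static int do_rename(int fail_step)
{
	int err = 0;

	if (fail_step == 1) { err = -5; goto out; }	/* e.g. -EIO */
	if (fail_step == 2) { err = -2; goto out; }	/* e.g. -ENOENT */
out:
	/* One exit point: commit on success, abort on any failure. */
	if (!err)
		err = transaction_commit();
	else
		transaction_abort();
	return err;
}

int main(void)
{
	do_rename(0);	/* prints "commit" */
	do_rename(1);	/* prints "abort" */
	return 0;
}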
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 5a880b4edf3d..dadafad2fae7 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -240,8 +240,14 @@ nilfs_find_entry(struct inode *, const struct qstr *, struct page **);
extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
extern int nilfs_empty_dir(struct inode *);
extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
-extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
- struct page *, struct inode *);
+int nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
+ struct page *page, struct inode *inode);
+
+static inline void nilfs_put_page(struct page *page)
+{
+ kunmap(page);
+ put_page(page);
+}
/* file.c */
extern int nilfs_sync_file(struct file *, loff_t, loff_t, int);
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index 0388e6b42100..feda45c7ca8e 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -176,7 +176,7 @@ out:
int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run,
CLST vcn, CLST lcn, CLST len, CLST *pre_alloc,
enum ALLOCATE_OPT opt, CLST *alen, const size_t fr,
- CLST *new_lcn)
+ CLST *new_lcn, CLST *new_len)
{
int err;
CLST flen, vcn0 = vcn, pre = pre_alloc ? *pre_alloc : 0;
@@ -196,20 +196,36 @@ int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run,
if (err)
goto out;
- if (new_lcn && vcn == vcn0)
- *new_lcn = lcn;
+ if (vcn == vcn0) {
+ /* Return the first fragment. */
+ if (new_lcn)
+ *new_lcn = lcn;
+ if (new_len)
+ *new_len = flen;
+ }
/* Add new fragment into run storage. */
- if (!run_add_entry(run, vcn, lcn, flen, opt == ALLOCATE_MFT)) {
+ if (!run_add_entry(run, vcn, lcn, flen, opt & ALLOCATE_MFT)) {
/* Undo last 'ntfs_look_for_free_space' */
mark_as_free_ex(sbi, lcn, len, false);
err = -ENOMEM;
goto out;
}
+ if (opt & ALLOCATE_ZERO) {
+ u8 shift = sbi->cluster_bits - SECTOR_SHIFT;
+
+ err = blkdev_issue_zeroout(sbi->sb->s_bdev,
+ (sector_t)lcn << shift,
+ (sector_t)flen << shift,
+ GFP_NOFS, 0);
+ if (err)
+ goto out;
+ }
+
vcn += flen;
- if (flen >= len || opt == ALLOCATE_MFT ||
+ if (flen >= len || (opt & ALLOCATE_MFT) ||
(fr && run->count - cnt >= fr)) {
*alen = vcn - vcn0;
return 0;
@@ -287,7 +303,8 @@ int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr,
const char *data = resident_data(attr);
err = attr_allocate_clusters(sbi, run, 0, 0, len, NULL,
- ALLOCATE_DEF, &alen, 0, NULL);
+ ALLOCATE_DEF, &alen, 0, NULL,
+ NULL);
if (err)
goto out1;
@@ -582,13 +599,13 @@ add_alloc_in_same_attr_seg:
/* ~3 bytes per fragment. */
err = attr_allocate_clusters(
sbi, run, vcn, lcn, to_allocate, &pre_alloc,
- is_mft ? ALLOCATE_MFT : 0, &alen,
+ is_mft ? ALLOCATE_MFT : ALLOCATE_DEF, &alen,
is_mft ? 0
: (sbi->record_size -
le32_to_cpu(rec->used) + 8) /
3 +
1,
- NULL);
+ NULL, NULL);
if (err)
goto out;
}
@@ -886,8 +903,19 @@ bad_inode:
return err;
}
+/*
+ * attr_data_get_block - Returns 'lcn' and 'len' for given 'vcn'.
+ *
+ * @new == NULL means just to get current mapping for 'vcn'
+ * @new != NULL means allocate real cluster if 'vcn' maps to hole
+ * @zero - zero out newly allocated clusters
+ *
+ * NOTE:
+ * - @new != NULL is called only for sparsed or compressed attributes.
+ * - newly allocated clusters are zeroed via blkdev_issue_zeroout.
+ */
int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
- CLST *len, bool *new)
+ CLST *len, bool *new, bool zero)
{
int err = 0;
struct runs_tree *run = &ni->file.run;
@@ -896,29 +924,27 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
struct ATTRIB *attr = NULL, *attr_b;
struct ATTR_LIST_ENTRY *le, *le_b;
struct mft_inode *mi, *mi_b;
- CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end;
+ CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end, vcn0, alen;
+ unsigned int fr;
u64 total_size;
- u32 clst_per_frame;
- bool ok;
if (new)
*new = false;
+ /* Try to find the run in the cache. */
down_read(&ni->file.run_lock);
- ok = run_lookup_entry(run, vcn, lcn, len, NULL);
+ if (!run_lookup_entry(run, vcn, lcn, len, NULL))
+ *len = 0;
up_read(&ni->file.run_lock);
- if (ok && (*lcn != SPARSE_LCN || !new)) {
- /* Normal way. */
- return 0;
+ if (*len) {
+ if (*lcn != SPARSE_LCN || !new)
+ return 0; /* Fast normal way without allocation. */
+ else if (clen > *len)
+ clen = *len;
}
- if (!clen)
- clen = 1;
-
- if (ok && clen > *len)
- clen = *len;
-
+ /* No cluster in cache or we need to allocate a cluster in a hole. */
sbi = ni->mi.sbi;
cluster_bits = sbi->cluster_bits;
@@ -944,12 +970,6 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
goto out;
}
- clst_per_frame = 1u << attr_b->nres.c_unit;
- to_alloc = (clen + clst_per_frame - 1) & ~(clst_per_frame - 1);
-
- if (vcn + to_alloc > asize)
- to_alloc = asize - vcn;
-
svcn = le64_to_cpu(attr_b->nres.svcn);
evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1;
@@ -968,36 +988,68 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
}
+ /* Load the actual run information into the cache. */
err = attr_load_runs(attr, ni, run, NULL);
if (err)
goto out;
- if (!ok) {
- ok = run_lookup_entry(run, vcn, lcn, len, NULL);
- if (ok && (*lcn != SPARSE_LCN || !new)) {
- /* Normal way. */
- err = 0;
- goto ok;
- }
+ if (!*len) {
+ if (run_lookup_entry(run, vcn, lcn, len, NULL)) {
+ if (*lcn != SPARSE_LCN || !new)
+ goto ok; /* Slow normal way without allocation. */
- if (!ok && !new) {
- *len = 0;
- err = 0;
+ if (clen > *len)
+ clen = *len;
+ } else if (!new) {
+ /* Here we may return -ENOENT.
+ * In any case, the caller gets zero length. */
goto ok;
}
-
- if (ok && clen > *len) {
- clen = *len;
- to_alloc = (clen + clst_per_frame - 1) &
- ~(clst_per_frame - 1);
- }
}
if (!is_attr_ext(attr_b)) {
+ /* The code below is only for sparsed or compressed attributes. */
err = -EINVAL;
goto out;
}
+ vcn0 = vcn;
+ to_alloc = clen;
+ fr = (sbi->record_size - le32_to_cpu(mi->mrec->used) + 8) / 3 + 1;
+ /* Allocate frame-aligned clusters.
+ * ntfs.sys usually uses 16 clusters per frame for sparsed or compressed.
+ * ntfs3 uses 1 cluster per frame for newly created sparsed files. */
+ if (attr_b->nres.c_unit) {
+ CLST clst_per_frame = 1u << attr_b->nres.c_unit;
+ CLST cmask = ~(clst_per_frame - 1);
+
+ /* Get frame-aligned vcn and to_alloc. */
+ vcn = vcn0 & cmask;
+ to_alloc = ((vcn0 + clen + clst_per_frame - 1) & cmask) - vcn;
+ if (fr < clst_per_frame)
+ fr = clst_per_frame;
+ zero = true;
+
+ /* Check if 'vcn' and 'vcn0' are in different attribute segments. */
+ if (vcn < svcn || evcn1 <= vcn) {
+ /* Load attribute for truncated vcn. */
+ attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0,
+ &vcn, &mi);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
+ err = attr_load_runs(attr, ni, run, NULL);
+ if (err)
+ goto out;
+ }
+ }
+
+ if (vcn + to_alloc > asize)
+ to_alloc = asize - vcn;
+
/* Get the last LCN to allocate from. */
hint = 0;
@@ -1011,18 +1063,33 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
hint = -1;
}
- err = attr_allocate_clusters(
- sbi, run, vcn, hint + 1, to_alloc, NULL, 0, len,
- (sbi->record_size - le32_to_cpu(mi->mrec->used) + 8) / 3 + 1,
- lcn);
+ /* Allocate and zero out new clusters. */
+ err = attr_allocate_clusters(sbi, run, vcn, hint + 1, to_alloc, NULL,
+ zero ? ALLOCATE_ZERO : ALLOCATE_DEF, &alen,
+ fr, lcn, len);
if (err)
goto out;
*new = true;
- end = vcn + *len;
-
+ end = vcn + alen;
total_size = le64_to_cpu(attr_b->nres.total_size) +
- ((u64)*len << cluster_bits);
+ ((u64)alen << cluster_bits);
+
+ if (vcn != vcn0) {
+ if (!run_lookup_entry(run, vcn0, lcn, len, NULL)) {
+ err = -EINVAL;
+ goto out;
+ }
+ if (*lcn == SPARSE_LCN) {
+ /* Internal error. Should not happen. */
+ WARN_ON(1);
+ err = -EINVAL;
+ goto out;
+ }
+ /* Check the case when vcn0 + len overlaps the newly allocated clusters. */
+ if (vcn0 + *len > end)
+ *len = end - vcn0;
+ }
repack:
err = mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn);
@@ -1547,7 +1614,7 @@ int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size,
struct ATTRIB *attr = NULL, *attr_b;
struct ATTR_LIST_ENTRY *le, *le_b;
struct mft_inode *mi, *mi_b;
- CLST svcn, evcn1, next_svcn, lcn, len;
+ CLST svcn, evcn1, next_svcn, len;
CLST vcn, end, clst_data;
u64 total_size, valid_size, data_size;
@@ -1623,8 +1690,9 @@ int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size,
}
err = attr_allocate_clusters(sbi, run, vcn + clst_data,
- hint + 1, len - clst_data, NULL, 0,
- &alen, 0, &lcn);
+ hint + 1, len - clst_data, NULL,
+ ALLOCATE_DEF, &alen, 0, NULL,
+ NULL);
if (err)
goto out;
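
The frame-alignment logic added to attr_data_get_block() rounds the requested range out to whole compression frames: the starting vcn is rounded down with a mask, and to_alloc is rounded up to cover the last touched frame. A standalone sketch of the same arithmetic with illustrative values (assuming c_unit = 4, i.e. 16 clusters per frame):

/* Standalone sketch: the frame-alignment arithmetic used when
 * attr_data_get_block() allocates for compressed/sparse attributes.
 * Values are illustrative; CLST is a cluster index in ntfs3. */
#include <stdio.h>
#include <inttypes.h>

typedef uint64_t CLST;

int main(void)
{
	unsigned c_unit = 4;			/* 16 clusters per frame */
	CLST clst_per_frame = 1u << c_unit;
	CLST cmask = ~(clst_per_frame - 1);

	CLST vcn0 = 37, clen = 5;		/* requested range 37..41 */
	CLST vcn = vcn0 & cmask;		/* round start down: 32 */
	CLST to_alloc = ((vcn0 + clen + clst_per_frame - 1) & cmask) - vcn;

	/* 37..41 widens to the whole frames 32..47 -> 16 clusters */
	printf("vcn=%" PRIu64 " to_alloc=%" PRIu64 "\n", vcn, to_alloc);
	return 0;
}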
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 70b38465aee3..6d9c1dfe9b1b 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -122,8 +122,8 @@ static int ntfs_extend_initialized_size(struct file *file,
bits = sbi->cluster_bits;
vcn = pos >> bits;
- err = attr_data_get_block(ni, vcn, 0, &lcn, &clen,
- NULL);
+ err = attr_data_get_block(ni, vcn, 1, &lcn, &clen, NULL,
+ false);
if (err)
goto out;
@@ -196,18 +196,18 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
struct address_space *mapping = inode->i_mapping;
u32 blocksize = 1 << inode->i_blkbits;
pgoff_t idx = vbo >> PAGE_SHIFT;
- u32 z_start = vbo & (PAGE_SIZE - 1);
+ u32 from = vbo & (PAGE_SIZE - 1);
pgoff_t idx_end = (vbo_to + PAGE_SIZE - 1) >> PAGE_SHIFT;
loff_t page_off;
struct buffer_head *head, *bh;
- u32 bh_next, bh_off, z_end;
+ u32 bh_next, bh_off, to;
sector_t iblock;
struct page *page;
- for (; idx < idx_end; idx += 1, z_start = 0) {
+ for (; idx < idx_end; idx += 1, from = 0) {
page_off = (loff_t)idx << PAGE_SHIFT;
- z_end = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off)
- : PAGE_SIZE;
+ to = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off)
+ : PAGE_SIZE;
iblock = page_off >> inode->i_blkbits;
page = find_or_create_page(mapping, idx,
@@ -224,7 +224,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
do {
bh_next = bh_off + blocksize;
- if (bh_next <= z_start || bh_off >= z_end)
+ if (bh_next <= from || bh_off >= to)
continue;
if (!buffer_mapped(bh)) {
@@ -258,7 +258,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
} while (bh_off = bh_next, iblock += 1,
head != (bh = bh->b_this_page));
- zero_user_segment(page, z_start, z_end);
+ zero_user_segment(page, from, to);
unlock_page(page);
put_page(page);
@@ -270,81 +270,6 @@ out:
}
/*
- * ntfs_sparse_cluster - Helper function to zero a new allocated clusters.
- *
- * NOTE: 512 <= cluster size <= 2M
- */
-void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn,
- CLST len)
-{
- struct address_space *mapping = inode->i_mapping;
- struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
- u64 vbo = (u64)vcn << sbi->cluster_bits;
- u64 bytes = (u64)len << sbi->cluster_bits;
- u32 blocksize = 1 << inode->i_blkbits;
- pgoff_t idx0 = page0 ? page0->index : -1;
- loff_t vbo_clst = vbo & sbi->cluster_mask_inv;
- loff_t end = ntfs_up_cluster(sbi, vbo + bytes);
- pgoff_t idx = vbo_clst >> PAGE_SHIFT;
- u32 from = vbo_clst & (PAGE_SIZE - 1);
- pgoff_t idx_end = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
- loff_t page_off;
- u32 to;
- bool partial;
- struct page *page;
-
- for (; idx < idx_end; idx += 1, from = 0) {
- page = idx == idx0 ? page0 : grab_cache_page(mapping, idx);
-
- if (!page)
- continue;
-
- page_off = (loff_t)idx << PAGE_SHIFT;
- to = (page_off + PAGE_SIZE) > end ? (end - page_off)
- : PAGE_SIZE;
- partial = false;
-
- if ((from || PAGE_SIZE != to) &&
- likely(!page_has_buffers(page))) {
- create_empty_buffers(page, blocksize, 0);
- }
-
- if (page_has_buffers(page)) {
- struct buffer_head *head, *bh;
- u32 bh_off = 0;
-
- bh = head = page_buffers(page);
- do {
- u32 bh_next = bh_off + blocksize;
-
- if (from <= bh_off && bh_next <= to) {
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- } else if (!buffer_uptodate(bh)) {
- partial = true;
- }
- bh_off = bh_next;
- } while (head != (bh = bh->b_this_page));
- }
-
- zero_user_segment(page, from, to);
-
- if (!partial) {
- if (!PageUptodate(page))
- SetPageUptodate(page);
- set_page_dirty(page);
- }
-
- if (idx != idx0) {
- unlock_page(page);
- put_page(page);
- }
- cond_resched();
- }
- mark_inode_dirty(inode);
-}
-
-/*
* ntfs_file_mmap - file_operations::mmap
*/
static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
@@ -385,18 +310,17 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
for (; vcn < end; vcn += len) {
err = attr_data_get_block(ni, vcn, 1, &lcn,
- &len, &new);
+ &len, &new, true);
if (err)
goto out;
-
- if (!new)
- continue;
- ntfs_sparse_cluster(inode, NULL, vcn, 1);
}
}
if (ni->i_valid < to) {
- inode_lock(inode);
+ if (!inode_trylock(inode)) {
+ err = -EAGAIN;
+ goto out;
+ }
err = ntfs_extend_initialized_size(file, ni,
ni->i_valid, to);
inode_unlock(inode);
@@ -433,6 +357,7 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count,
}
if (extend_init && !is_compressed(ni)) {
+ WARN_ON(ni->i_valid >= pos);
err = ntfs_extend_initialized_size(file, ni, ni->i_valid, pos);
if (err)
goto out;
@@ -532,7 +457,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
struct ntfs_sb_info *sbi = sb->s_fs_info;
struct ntfs_inode *ni = ntfs_i(inode);
loff_t end = vbo + len;
- loff_t vbo_down = round_down(vbo, PAGE_SIZE);
+ loff_t vbo_down = round_down(vbo, max_t(unsigned long,
+ sbi->cluster_size, PAGE_SIZE));
bool is_supported_holes = is_sparsed(ni) || is_compressed(ni);
loff_t i_size, new_size;
bool map_locked;
@@ -585,11 +511,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
u32 frame_size;
loff_t mask, vbo_a, end_a, tmp;
- err = filemap_write_and_wait_range(mapping, vbo, end - 1);
- if (err)
- goto out;
-
- err = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
+ err = filemap_write_and_wait_range(mapping, vbo_down,
+ LLONG_MAX);
if (err)
goto out;
@@ -692,39 +615,35 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
goto out;
if (is_supported_holes) {
- CLST vcn_v = ni->i_valid >> sbi->cluster_bits;
CLST vcn = vbo >> sbi->cluster_bits;
CLST cend = bytes_to_cluster(sbi, end);
+ CLST cend_v = bytes_to_cluster(sbi, ni->i_valid);
CLST lcn, clen;
bool new;
+ if (cend_v > cend)
+ cend_v = cend;
+
/*
- * Allocate but do not zero new clusters. (see below comments)
- * This breaks security: One can read unused on-disk areas.
+ * Allocate and zero new clusters.
* Zeroing these clusters may be too long.
- * Maybe we should check here for root rights?
+ */
+ for (; vcn < cend_v; vcn += clen) {
+ err = attr_data_get_block(ni, vcn, cend_v - vcn,
+ &lcn, &clen, &new,
+ true);
+ if (err)
+ goto out;
+ }
+ /*
+ * Allocate but do not zero new clusters.
*/
for (; vcn < cend; vcn += clen) {
err = attr_data_get_block(ni, vcn, cend - vcn,
- &lcn, &clen, &new);
+ &lcn, &clen, &new,
+ false);
if (err)
goto out;
- if (!new || vcn >= vcn_v)
- continue;
-
- /*
- * Unwritten area.
- * NTFS is not able to store several unwritten areas.
- * Activate 'ntfs_sparse_cluster' to zero new allocated clusters.
- *
- * Dangerous in case:
- * 1G of sparsed clusters + 1 cluster of data =>
- * valid_size == 1G + 1 cluster
- * fallocate(1G) will zero 1G and this can be very long
- * xfstest 016/086 will fail without 'ntfs_sparse_cluster'.
- */
- ntfs_sparse_cluster(inode, NULL, vcn,
- min(vcn_v - vcn, clen));
}
}
@@ -945,8 +864,8 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
frame_vbo = valid & ~(frame_size - 1);
off = valid & (frame_size - 1);
- err = attr_data_get_block(ni, frame << NTFS_LZNT_CUNIT, 0, &lcn,
- &clen, NULL);
+ err = attr_data_get_block(ni, frame << NTFS_LZNT_CUNIT, 1, &lcn,
+ &clen, NULL, false);
if (err)
goto out;
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index d41ddc06f207..fb572688f919 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -2297,7 +2297,7 @@ int ni_decompress_file(struct ntfs_inode *ni)
for (vcn = vbo >> sbi->cluster_bits; vcn < end; vcn += clen) {
err = attr_data_get_block(ni, vcn, cend - vcn, &lcn,
- &clen, &new);
+ &clen, &new, false);
if (err)
goto out;
}
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 2589f6d1215f..ee6de53d2ad1 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -617,7 +617,7 @@ static bool index_hdr_check(const struct INDEX_HDR *hdr, u32 bytes)
u32 off = le32_to_cpu(hdr->de_off);
if (!IS_ALIGNED(off, 8) || tot > bytes || end > tot ||
- off + sizeof(struct NTFS_DE) > end) {
+ size_add(off, sizeof(struct NTFS_DE)) > end) {
/* incorrect index buffer. */
return false;
}
@@ -736,7 +736,7 @@ fill_table:
if (end > total)
return NULL;
- if (off + sizeof(struct NTFS_DE) > end)
+ if (size_add(off, sizeof(struct NTFS_DE)) > end)
return NULL;
e = Add2Ptr(hdr, off);
@@ -1442,8 +1442,8 @@ static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
run_init(&run);
- err = attr_allocate_clusters(sbi, &run, 0, 0, len, NULL, 0, &alen, 0,
- NULL);
+ err = attr_allocate_clusters(sbi, &run, 0, 0, len, NULL, ALLOCATE_DEF,
+ &alen, 0, NULL, NULL);
if (err)
goto out;
@@ -2166,6 +2166,10 @@ static int indx_get_entry_to_replace(struct ntfs_index *indx,
e = hdr_first_de(&n->index->ihdr);
fnd_push(fnd, n, e);
+ if (!e) {
+ err = -EINVAL;
+ goto out;
+ }
if (!de_is_last(e)) {
/*
@@ -2187,6 +2191,10 @@ static int indx_get_entry_to_replace(struct ntfs_index *indx,
n = fnd->nodes[level];
te = hdr_first_de(&n->index->ihdr);
+ if (!te) {
+ err = -EINVAL;
+ goto out;
+ }
/* Copy the candidate entry into the replacement entry buffer. */
re = kmalloc(le16_to_cpu(te->size) + sizeof(u64), GFP_NOFS);
if (!re) {
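
The index_hdr_check()/fill_table changes replace plain additions with size_add() so a hostile on-disk offset cannot wrap around and slip past the bounds comparison. A standalone sketch, with a saturating add modeled on the kernel's size_add() (the wrap shown assumes 32-bit unsigned arithmetic):

/* Standalone sketch: a saturating add modeled on the kernel's
 * size_add(); the wrap below assumes 32-bit unsigned arithmetic. */
#include <stdio.h>
#include <stdint.h>

static size_t size_add_sat(size_t a, size_t b)
{
	size_t sum = a + b;

	return sum < a ? SIZE_MAX : sum;	/* saturate, never wrap */
}

int main(void)
{
	uint32_t off = 0xfffffff8u;	/* hostile on-disk offset */
	uint32_t end = 4096;

	/* 32-bit wrap: 0xfffffff8 + 16 == 8, so "off + 16 <= end" passes. */
	printf("wrapped check passes: %d\n", (uint32_t)(off + 16u) <= end);
	/* The saturating form keeps the comparison honest. */
	printf("saturated check passes: %d\n", size_add_sat(off, 16) <= end);
	return 0;
}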
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 057aa3cec902..5baf6a2b3d48 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -592,7 +592,8 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
off = vbo & sbi->cluster_mask;
new = false;
- err = attr_data_get_block(ni, vcn, 1, &lcn, &len, create ? &new : NULL);
+ err = attr_data_get_block(ni, vcn, 1, &lcn, &len, create ? &new : NULL,
+ create && sbi->cluster_size > PAGE_SIZE);
if (err)
goto out;
@@ -610,11 +611,8 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
WARN_ON(1);
}
- if (new) {
+ if (new)
set_buffer_new(bh);
- if ((len << cluster_bits) > block_size)
- ntfs_sparse_cluster(inode, page, vcn, len);
- }
lbo = ((u64)lcn << cluster_bits) + off;
@@ -1533,8 +1531,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
cpu_to_le64(ntfs_up_cluster(sbi, nsize));
err = attr_allocate_clusters(sbi, &ni->file.run, 0, 0,
- clst, NULL, 0, &alen, 0,
- NULL);
+ clst, NULL, ALLOCATE_DEF,
+ &alen, 0, NULL, NULL);
if (err)
goto out5;
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 26dbe1b46fdd..f2f32e304b3d 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -42,9 +42,11 @@ enum utf16_endian;
#define MINUS_ONE_T ((size_t)(-1))
/* Biggest MFT / smallest cluster */
#define MAXIMUM_BYTES_PER_MFT 4096
+#define MAXIMUM_SHIFT_BYTES_PER_MFT 12
#define NTFS_BLOCKS_PER_MFT_RECORD (MAXIMUM_BYTES_PER_MFT / 512)
#define MAXIMUM_BYTES_PER_INDEX 4096
+#define MAXIMUM_SHIFT_BYTES_PER_INDEX 12
#define NTFS_BLOCKS_PER_INODE (MAXIMUM_BYTES_PER_INDEX / 512)
/* NTFS specific error code when fixup failed. */
@@ -126,6 +128,7 @@ struct ntfs_buffers {
enum ALLOCATE_OPT {
ALLOCATE_DEF = 0, // Allocate all clusters.
ALLOCATE_MFT = 1, // Allocate for MFT.
+ ALLOCATE_ZERO = 2, // Zero out newly allocated clusters
};
enum bitmap_mutex_classes {
@@ -416,7 +419,7 @@ enum REPARSE_SIGN {
int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run,
CLST vcn, CLST lcn, CLST len, CLST *pre_alloc,
enum ALLOCATE_OPT opt, CLST *alen, const size_t fr,
- CLST *new_lcn);
+ CLST *new_lcn, CLST *new_len);
int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr,
struct ATTR_LIST_ENTRY *le, struct mft_inode *mi,
u64 new_size, struct runs_tree *run,
@@ -426,7 +429,7 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type,
u64 new_size, const u64 *new_valid, bool keep_prealloc,
struct ATTRIB **ret);
int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
- CLST *len, bool *new);
+ CLST *len, bool *new, bool zero);
int attr_data_read_resident(struct ntfs_inode *ni, struct page *page);
int attr_data_write_resident(struct ntfs_inode *ni, struct page *page);
int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type,
@@ -491,8 +494,6 @@ extern const struct file_operations ntfs_dir_operations;
/* Globals from file.c */
int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, u32 flags);
-void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn,
- CLST len);
int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
struct iattr *attr);
int ntfs_file_open(struct inode *inode, struct file *file);
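
With ALLOCATE_ZERO added, ALLOCATE_OPT values are combined and tested as flag bits, which is why the attrib.c hunks switch from 'opt == ALLOCATE_MFT' to 'opt & ALLOCATE_MFT'. A standalone sketch of the flag-mask semantics (mirrors the patched enum; helper names are illustrative):

/* Standalone sketch: ALLOCATE_OPT treated as a flag mask, so
 * ALLOCATE_ZERO can be combined with other options and each bit is
 * tested with '&' rather than '=='. */
#include <stdio.h>

enum ALLOCATE_OPT {
	ALLOCATE_DEF  = 0,	/* allocate all clusters */
	ALLOCATE_MFT  = 1,	/* allocate for MFT */
	ALLOCATE_ZERO = 2,	/* zero out newly allocated clusters */
};

static void describe(enum ALLOCATE_OPT opt)
{
	printf("mft=%d zero=%d\n",
	       !!(opt & ALLOCATE_MFT), !!(opt & ALLOCATE_ZERO));
}

int main(void)
{
	describe(ALLOCATE_DEF);			/* 0 0 */
	describe(ALLOCATE_MFT);			/* 1 0 */
	describe(ALLOCATE_MFT | ALLOCATE_ZERO);	/* 1 1: an '==' test would miss this */
	return 0;
}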
diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c
index b2b98631a000..bfb1f4c2f271 100644
--- a/fs/ntfs3/record.c
+++ b/fs/ntfs3/record.c
@@ -325,6 +325,9 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
} else {
if (attr->nres.c_unit)
return NULL;
+
+ if (alloc_size > mi->sbi->volume.size)
+ return NULL;
}
return attr;
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index eee54214f4a3..674a16c0c66b 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -680,7 +680,7 @@ static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot)
* ntfs_init_from_boot - Init internal info from on-disk boot sector.
*/
static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
- u64 dev_size)
+ u64 dev_size)
{
struct ntfs_sb_info *sbi = sb->s_fs_info;
int err;
@@ -705,12 +705,12 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
/* 0x55AA is not mandatory. Thanks Maxim Suhanov */
/*if (0x55 != boot->boot_magic[0] || 0xAA != boot->boot_magic[1])
- * goto out;
+ * goto out;
*/
boot_sector_size = (u32)boot->bytes_per_sector[1] << 8;
if (boot->bytes_per_sector[0] || boot_sector_size < SECTOR_SIZE ||
- !is_power_of_2(boot_sector_size)) {
+ !is_power_of_2(boot_sector_size)) {
goto out;
}
@@ -733,15 +733,49 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
/* Check MFT record size. */
if ((boot->record_size < 0 &&
- SECTOR_SIZE > (2U << (-boot->record_size))) ||
- (boot->record_size >= 0 && !is_power_of_2(boot->record_size))) {
+ SECTOR_SIZE > (2U << (-boot->record_size))) ||
+ (boot->record_size >= 0 && !is_power_of_2(boot->record_size))) {
+ goto out;
+ }
+
+ /* Calculate cluster size */
+ sbi->cluster_size = boot_sector_size * sct_per_clst;
+ sbi->cluster_bits = blksize_bits(sbi->cluster_size);
+
+ if (boot->record_size >= 0) {
+ record_size = (u32)boot->record_size << sbi->cluster_bits;
+ } else if (-boot->record_size <= MAXIMUM_SHIFT_BYTES_PER_MFT) {
+ record_size = 1u << (-boot->record_size);
+ } else {
+ ntfs_err(sb, "%s: invalid record size %d.", "NTFS",
+ boot->record_size);
+ goto out;
+ }
+
+ sbi->record_size = record_size;
+ sbi->record_bits = blksize_bits(record_size);
+ sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes
+
+ if (record_size > MAXIMUM_BYTES_PER_MFT) {
+ ntfs_err(sb, "Unsupported bytes per MFT record %u.",
+ record_size);
+ goto out;
+ }
+
+ if (boot->index_size >= 0) {
+ sbi->index_size = (u32)boot->index_size << sbi->cluster_bits;
+ } else if (-boot->index_size <= MAXIMUM_SHIFT_BYTES_PER_INDEX) {
+ sbi->index_size = 1u << (-boot->index_size);
+ } else {
+ ntfs_err(sb, "%s: invalid index size %d.", "NTFS",
+ boot->index_size);
goto out;
}
/* Check index record size. */
if ((boot->index_size < 0 &&
- SECTOR_SIZE > (2U << (-boot->index_size))) ||
- (boot->index_size >= 0 && !is_power_of_2(boot->index_size))) {
+ SECTOR_SIZE > (2U << (-boot->index_size))) ||
+ (boot->index_size >= 0 && !is_power_of_2(boot->index_size))) {
goto out;
}
@@ -762,9 +796,6 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
dev_size += sector_size - 1;
}
- sbi->cluster_size = boot_sector_size * sct_per_clst;
- sbi->cluster_bits = blksize_bits(sbi->cluster_size);
-
sbi->mft.lbo = mlcn << sbi->cluster_bits;
sbi->mft.lbo2 = mlcn2 << sbi->cluster_bits;
@@ -785,9 +816,9 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
sbi->cluster_mask = sbi->cluster_size - 1;
sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask;
sbi->record_size = record_size = boot->record_size < 0
- ? 1 << (-boot->record_size)
- : (u32)boot->record_size
- << sbi->cluster_bits;
+ ? 1 << (-boot->record_size)
+ : (u32)boot->record_size
+ << sbi->cluster_bits;
if (record_size > MAXIMUM_BYTES_PER_MFT || record_size < SECTOR_SIZE)
goto out;
@@ -801,8 +832,8 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
ALIGN(sizeof(enum ATTR_TYPE), 8);
sbi->index_size = boot->index_size < 0
- ? 1u << (-boot->index_size)
- : (u32)boot->index_size << sbi->cluster_bits;
+ ? 1u << (-boot->index_size)
+ : (u32)boot->index_size << sbi->cluster_bits;
sbi->volume.ser_num = le64_to_cpu(boot->serial_num);
@@ -871,13 +902,6 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits;
#endif
- /*
- * Compute the MFT zone at two steps.
- * It would be nice if we are able to allocate 1/8 of
- * total clusters for MFT but not more then 512 MB.
- */
- sbi->zone_max = min_t(CLST, 0x20000000 >> sbi->cluster_bits, clusters >> 3);
-
err = 0;
out:
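
The boot-sector hunks decode record_size and index_size once, up front: a non-negative value counts clusters and is shifted by cluster_bits, while a negative value v encodes 2^(-v) bytes and is now bounded by the new MAXIMUM_SHIFT_BYTES_PER_* limits. A standalone sketch of the record-size decoding (simplified, not the kernel function):

/* Standalone sketch (simplified, not the kernel function): decode the
 * boot sector's signed record_size field as the patched code does. */
#include <stdio.h>
#include <stdint.h>

#define MAXIMUM_SHIFT_BYTES_PER_MFT 12		/* 2^12 = 4096 */

static int decode_record_size(int8_t raw, unsigned cluster_bits, uint32_t *out)
{
	if (raw >= 0)
		*out = (uint32_t)raw << cluster_bits;	/* count of clusters */
	else if (-raw <= MAXIMUM_SHIFT_BYTES_PER_MFT)
		*out = 1u << -raw;			/* 2^(-raw) bytes */
	else
		return -1;				/* invalid record size */
	return 0;
}

int main(void)
{
	uint32_t sz;

	decode_record_size(-10, 12, &sz);	/* 1024-byte records */
	printf("%u\n", sz);
	decode_record_size(1, 12, &sz);		/* one 4K cluster */
	printf("%u\n", sz);
	printf("%d\n", decode_record_size(-77, 12, &sz));	/* rejected */
	return 0;
}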
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 51c93929a146..7f11ffacc915 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1796,6 +1796,14 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
el = root_el;
while (el->l_tree_depth) {
+ if (unlikely(le16_to_cpu(el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH)) {
+ ocfs2_error(ocfs2_metadata_cache_get_super(ci),
+ "Owner %llu has invalid tree depth %u in extent list\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ le16_to_cpu(el->l_tree_depth));
+ ret = -EROFS;
+ goto out;
+ }
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
"Owner %llu has empty extent list at depth %u\n",
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 775766856a54..7f5fd16d3740 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -173,7 +173,7 @@ int ocfs2_recovery_init(struct ocfs2_super *osb)
struct ocfs2_recovery_map *rm;
mutex_init(&osb->recovery_lock);
- osb->disable_recovery = 0;
+ osb->recovery_state = OCFS2_REC_ENABLED;
osb->recovery_thread_task = NULL;
init_waitqueue_head(&osb->recovery_event);
@@ -192,31 +192,53 @@ int ocfs2_recovery_init(struct ocfs2_super *osb)
return 0;
}
-/* we can't grab the goofy sem lock from inside wait_event, so we use
- * memory barriers to make sure that we'll see the null task before
- * being woken up */
static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
{
- mb();
return osb->recovery_thread_task != NULL;
}
-void ocfs2_recovery_exit(struct ocfs2_super *osb)
+static void ocfs2_recovery_disable(struct ocfs2_super *osb,
+ enum ocfs2_recovery_state state)
{
- struct ocfs2_recovery_map *rm;
-
- /* disable any new recovery threads and wait for any currently
- * running ones to exit. Do this before setting the vol_state. */
mutex_lock(&osb->recovery_lock);
- osb->disable_recovery = 1;
+ /*
+ * If the recovery thread is not running, we can transition directly to
+ * the final state.
+ */
+ if (!ocfs2_recovery_thread_running(osb)) {
+ osb->recovery_state = state + 1;
+ goto out_lock;
+ }
+ osb->recovery_state = state;
+ /* Wait for recovery thread to acknowledge state transition */
+ wait_event_cmd(osb->recovery_event,
+ !ocfs2_recovery_thread_running(osb) ||
+ osb->recovery_state >= state + 1,
+ mutex_unlock(&osb->recovery_lock),
+ mutex_lock(&osb->recovery_lock));
+out_lock:
mutex_unlock(&osb->recovery_lock);
- wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
- /* At this point, we know that no more recovery threads can be
- * launched, so wait for any recovery completion work to
- * complete. */
+ /*
+ * At this point we know that no more recovery work can be queued, so
+ * wait for any recovery completion work to complete.
+ */
if (osb->ocfs2_wq)
flush_workqueue(osb->ocfs2_wq);
+}
+
+void ocfs2_recovery_disable_quota(struct ocfs2_super *osb)
+{
+ ocfs2_recovery_disable(osb, OCFS2_REC_QUOTA_WANT_DISABLE);
+}
+
+void ocfs2_recovery_exit(struct ocfs2_super *osb)
+{
+ struct ocfs2_recovery_map *rm;
+
+ /* disable any new recovery threads and wait for any currently
+ * running ones to exit. Do this before setting the vol_state. */
+ ocfs2_recovery_disable(osb, OCFS2_REC_WANT_DISABLE);
/*
* Now that recovery is shut down, and the osb is about to be
@@ -1439,6 +1461,18 @@ static int __ocfs2_recovery_thread(void *arg)
}
}
restart:
+ if (quota_enabled) {
+ mutex_lock(&osb->recovery_lock);
+ /* Confirm that recovery thread will no longer recover quotas */
+ if (osb->recovery_state == OCFS2_REC_QUOTA_WANT_DISABLE) {
+ osb->recovery_state = OCFS2_REC_QUOTA_DISABLED;
+ wake_up(&osb->recovery_event);
+ }
+ if (osb->recovery_state >= OCFS2_REC_QUOTA_DISABLED)
+ quota_enabled = 0;
+ mutex_unlock(&osb->recovery_lock);
+ }
+
status = ocfs2_super_lock(osb, 1);
if (status < 0) {
mlog_errno(status);
@@ -1536,27 +1570,29 @@ bail:
ocfs2_free_replay_slots(osb);
osb->recovery_thread_task = NULL;
- mb(); /* sync with ocfs2_recovery_thread_running */
+ if (osb->recovery_state == OCFS2_REC_WANT_DISABLE)
+ osb->recovery_state = OCFS2_REC_DISABLED;
wake_up(&osb->recovery_event);
mutex_unlock(&osb->recovery_lock);
- if (quota_enabled)
- kfree(rm_quota);
+ kfree(rm_quota);
return status;
}
void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
{
+ int was_set = -1;
+
mutex_lock(&osb->recovery_lock);
+ if (osb->recovery_state < OCFS2_REC_WANT_DISABLE)
+ was_set = ocfs2_recovery_map_set(osb, node_num);
trace_ocfs2_recovery_thread(node_num, osb->node_num,
- osb->disable_recovery, osb->recovery_thread_task,
- osb->disable_recovery ?
- -1 : ocfs2_recovery_map_set(osb, node_num));
+ osb->recovery_state, osb->recovery_thread_task, was_set);
- if (osb->disable_recovery)
+ if (osb->recovery_state >= OCFS2_REC_WANT_DISABLE)
goto out;
if (osb->recovery_thread_task)
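
The ocfs2 recovery rework replaces the single disable_recovery flag with a small state machine: a requester parks the state at a *_WANT_DISABLE value, and the recovery thread (or the requester itself, when no thread is running) acknowledges by advancing to the adjacent *_DISABLED value, which is what the "must be ... + 1" comments in ocfs2.h guarantee. A standalone, thread-free sketch of that handshake (illustrative names):

/* Standalone sketch (illustrative names): the "state, then state + 1"
 * acknowledge pattern the ocfs2 recovery rework relies on. */
#include <stdio.h>

enum rec_state {
	REC_ENABLED = 0,
	REC_QUOTA_WANT_DISABLE,
	REC_QUOTA_DISABLED,	/* must be QUOTA_WANT_DISABLE + 1 */
	REC_WANT_DISABLE,
	REC_DISABLED,		/* must be WANT_DISABLE + 1 */
};

static enum rec_state state = REC_ENABLED;

/* Requester side: ask for a disable level (no thread running here). */
static void recovery_disable(enum rec_state want)
{
	state = want + 1;	/* thread idle: jump straight to final state */
}

int main(void)
{
	recovery_disable(REC_QUOTA_WANT_DISABLE);
	printf("quota disabled: %d\n", state >= REC_QUOTA_DISABLED);
	recovery_disable(REC_WANT_DISABLE);
	printf("fully disabled: %d\n", state >= REC_DISABLED);
	return 0;
}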
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 689c340c6363..5ebde794a653 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -148,6 +148,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
int ocfs2_recovery_init(struct ocfs2_super *osb);
void ocfs2_recovery_exit(struct ocfs2_super *osb);
+void ocfs2_recovery_disable_quota(struct ocfs2_super *osb);
int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
void ocfs2_free_replay_slots(struct ocfs2_super *osb);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 740b64238312..22882c636bfb 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -284,6 +284,21 @@ enum ocfs2_mount_options
#define OCFS2_OSB_ERROR_FS 0x0004
#define OCFS2_DEFAULT_ATIME_QUANTUM 60
+enum ocfs2_recovery_state {
+ OCFS2_REC_ENABLED = 0,
+ OCFS2_REC_QUOTA_WANT_DISABLE,
+ /*
+ * Must be OCFS2_REC_QUOTA_WANT_DISABLE + 1 for
+ * ocfs2_recovery_disable_quota() to work.
+ */
+ OCFS2_REC_QUOTA_DISABLED,
+ OCFS2_REC_WANT_DISABLE,
+ /*
+ * Must be OCFS2_REC_WANT_DISABLE + 1 for ocfs2_recovery_exit() to work
+ */
+ OCFS2_REC_DISABLED,
+};
+
struct ocfs2_journal;
struct ocfs2_slot_info;
struct ocfs2_recovery_map;
@@ -346,7 +361,7 @@ struct ocfs2_super
struct ocfs2_recovery_map *recovery_map;
struct ocfs2_replay_map *replay_map;
struct task_struct *recovery_thread_task;
- int disable_recovery;
+ enum ocfs2_recovery_state recovery_state;
wait_queue_head_t checkpoint_event;
struct ocfs2_journal *journal;
unsigned long osb_commit_interval;
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 4b4fa58cd32f..c7bda48b5fb2 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -453,8 +453,7 @@ out:
/* Sync changes in local quota file into global quota file and
* reinitialize local quota file.
- * The function expects local quota file to be already locked and
- * s_umount locked in shared mode. */
+ * The function expects the local quota file to be already locked. */
static int ocfs2_recover_local_quota_file(struct inode *lqinode,
int type,
struct ocfs2_quota_recovery *rec)
@@ -585,7 +584,6 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
{
unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
LOCAL_GROUP_QUOTA_SYSTEM_INODE };
- struct super_block *sb = osb->sb;
struct ocfs2_local_disk_dqinfo *ldinfo;
struct buffer_head *bh;
handle_t *handle;
@@ -597,7 +595,6 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for "
"slot %u\n", osb->dev_str, slot_num);
- down_read(&sb->s_umount);
for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
if (list_empty(&(rec->r_list[type])))
continue;
@@ -674,8 +671,7 @@ out_put:
break;
}
out:
- up_read(&sb->s_umount);
- kfree(rec);
+ ocfs2_free_quota_recovery(rec);
return status;
}
@@ -840,8 +836,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
/*
- * s_umount held in exclusive mode protects us against racing with
- * recovery thread...
+ * ocfs2_dismount_volume() has already aborted quota recovery...
*/
if (oinfo->dqi_rec) {
ocfs2_free_quota_recovery(oinfo->dqi_rec);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 8f7bb76d9cde..b30f9d71a35d 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1869,6 +1869,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
/* Orphan scan should be stopped as early as possible */
ocfs2_orphan_scan_stop(osb);
+ /* Stop quota recovery so that we can disable quotas */
+ ocfs2_recovery_disable_quota(osb);
+
ocfs2_disable_quotas(osb);
/* All dquots should be freed by now */
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index fa7fe2393ff6..425ff6800c41 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -14,7 +14,7 @@ static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset)
{
return (sbi->s_sys_blocksize - offset -
sizeof(struct omfs_extent)) /
- sizeof(struct omfs_extent_entry) + 1;
+ sizeof(struct omfs_extent_entry);
}
void omfs_make_empty_table(struct buffer_head *bh, int offset)
@@ -24,8 +24,8 @@ void omfs_make_empty_table(struct buffer_head *bh, int offset)
oe->e_next = ~cpu_to_be64(0ULL);
oe->e_extent_count = cpu_to_be32(1),
oe->e_fill = cpu_to_be32(0x22),
- oe->e_entry.e_cluster = ~cpu_to_be64(0ULL);
- oe->e_entry.e_blocks = ~cpu_to_be64(0ULL);
+ oe->e_entry[0].e_cluster = ~cpu_to_be64(0ULL);
+ oe->e_entry[0].e_blocks = ~cpu_to_be64(0ULL);
}
int omfs_shrink_inode(struct inode *inode)
@@ -68,7 +68,7 @@ int omfs_shrink_inode(struct inode *inode)
last = next;
next = be64_to_cpu(oe->e_next);
- entry = &oe->e_entry;
+ entry = oe->e_entry;
/* ignore last entry as it is the terminator */
for (; extent_count > 1; extent_count--) {
@@ -117,7 +117,7 @@ static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe,
u64 *ret_block)
{
struct omfs_extent_entry *terminator;
- struct omfs_extent_entry *entry = &oe->e_entry;
+ struct omfs_extent_entry *entry = oe->e_entry;
struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
u32 extent_count = be32_to_cpu(oe->e_extent_count);
u64 new_block = 0;
@@ -245,7 +245,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
extent_count = be32_to_cpu(oe->e_extent_count);
next = be64_to_cpu(oe->e_next);
- entry = &oe->e_entry;
+ entry = oe->e_entry;
if (extent_count > max_extents)
goto out_brelse;
diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h
index caecb3d5a344..1ff6b9e41297 100644
--- a/fs/omfs/omfs_fs.h
+++ b/fs/omfs/omfs_fs.h
@@ -77,7 +77,7 @@ struct omfs_extent {
__be64 e_next; /* next extent table location */
__be32 e_extent_count; /* total # extents in this table */
__be32 e_fill;
- struct omfs_extent_entry e_entry; /* start of extent entries */
+ struct omfs_extent_entry e_entry[]; /* start of extent entries */
};
#endif
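
Converting e_entry to a C99 flexible array member keeps the terminator entry out of sizeof(struct omfs_extent), which is why omfs_max_extents() drops its "+ 1" and the users index e_entry[0] instead of taking &oe->e_entry. A standalone sketch of the layout change (struct names illustrative):

/* Standalone sketch (struct names illustrative): a flexible array
 * member keeps the variable part out of sizeof(), as in omfs_extent. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct extent_entry { uint64_t cluster, blocks; };

struct extent_table {
	uint64_t next;
	uint32_t count, fill;
	struct extent_entry entry[];	/* was: struct extent_entry entry; */
};

int main(void)
{
	size_t blocksize = 2048;
	/* sizeof() covers only the header now, so no "+ 1" correction. */
	size_t max = (blocksize - sizeof(struct extent_table)) /
		     sizeof(struct extent_entry);
	struct extent_table *t = malloc(sizeof(*t) + max * sizeof(t->entry[0]));

	if (!t)
		return 1;
	t->entry[0].cluster = ~0ULL;	/* terminator, as in omfs_make_empty_table */
	printf("header=%zu max_entries=%zu\n", sizeof(struct extent_table), max);
	free(t);
	return 0;
}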
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index b3bbb5a5787a..cc81ff6ac735 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -23,9 +23,9 @@ static int orangefs_writepage_locked(struct page *page,
struct orangefs_write_range *wr = NULL;
struct iov_iter iter;
struct bio_vec bv;
- size_t len, wlen;
+ size_t wlen;
ssize_t ret;
- loff_t off;
+ loff_t len, off;
set_page_writeback(page);
@@ -94,8 +94,7 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow,
struct orangefs_write_range *wrp, wr;
struct iov_iter iter;
ssize_t ret;
- size_t len;
- loff_t off;
+ loff_t len, off;
int i;
len = i_size_read(inode);
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index fa41db088488..b57140ebfad0 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -728,8 +728,8 @@ static void do_k_string(void *k_mask, int index)
if (*mask & s_kmod_keyword_mask_map[index].mask_val) {
if ((strlen(kernel_debug_string) +
- strlen(s_kmod_keyword_mask_map[index].keyword))
- < ORANGEFS_MAX_DEBUG_STRING_LEN - 1) {
+ strlen(s_kmod_keyword_mask_map[index].keyword) + 1)
+ < ORANGEFS_MAX_DEBUG_STRING_LEN) {
strcat(kernel_debug_string,
s_kmod_keyword_mask_map[index].keyword);
strcat(kernel_debug_string, ",");
@@ -756,7 +756,7 @@ static void do_c_string(void *c_mask, int index)
(mask->mask2 & cdm_array[index].mask2)) {
if ((strlen(client_debug_string) +
strlen(cdm_array[index].keyword) + 1)
- < ORANGEFS_MAX_DEBUG_STRING_LEN - 2) {
+ < ORANGEFS_MAX_DEBUG_STRING_LEN) {
strcat(client_debug_string,
cdm_array[index].keyword);
strcat(client_debug_string, ",");
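
Both orangefs-debugfs fixes make the strcat() bound account for the separator exactly once: the appended text is keyword plus ',', so the check needs strlen(keyword) + 1, and the strict '<' against the buffer size leaves one byte for the trailing NUL. A standalone sketch of the corrected accounting (MAXLEN and names are illustrative):

/* Standalone sketch: the corrected bound from do_k_string()/do_c_string().
 * Appending "keyword," needs strlen(keyword) + 1 extra bytes, and the
 * strict '<' leaves room for the final NUL. */
#include <stdio.h>
#include <string.h>

#define MAXLEN 16

static void append_keyword(char *buf, const char *kw)
{
	if (strlen(buf) + strlen(kw) + 1 < MAXLEN) {	/* +1 for ',' */
		strcat(buf, kw);
		strcat(buf, ",");
	} else {
		printf("skipped %s\n", kw);
	}
}

int main(void)
{
	char buf[MAXLEN] = "";

	append_keyword(buf, "super");	/* fits: "super," */
	append_keyword(buf, "client");	/* fits: "super,client," */
	append_keyword(buf, "verbose");	/* would overflow: skipped */
	printf("%s\n", buf);
	return 0;
}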
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 83cd20f79c5c..6922d8d705cb 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -229,7 +229,9 @@ enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path)
struct dentry *ovl_dentry_upper(struct dentry *dentry)
{
- return ovl_upperdentry_dereference(OVL_I(d_inode(dentry)));
+ struct inode *inode = d_inode(dentry);
+
+ return inode ? ovl_upperdentry_dereference(OVL_I(inode)) : NULL;
}
struct dentry *ovl_dentry_lower(struct dentry *dentry)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ecc45389ea79..fb169bdfd9a8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -415,7 +415,7 @@ static const struct file_operations proc_pid_cmdline_ops = {
#ifdef CONFIG_KALLSYMS
/*
* Provides a wchan file via kallsyms in a proper one-value-per-file format.
- * Returns the resolved symbol. If that fails, simply return the address.
+ * Returns the resolved symbol to user space.
*/
static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
@@ -2633,10 +2633,11 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
}
task_lock(p);
- if (slack_ns == 0)
- p->timer_slack_ns = p->default_timer_slack_ns;
- else
- p->timer_slack_ns = slack_ns;
+ if (task_is_realtime(p))
+ slack_ns = 0;
+ else if (slack_ns == 0)
+ slack_ns = p->default_timer_slack_ns;
+ p->timer_slack_ns = slack_ns;
task_unlock(p);
out:
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 587b91d9d998..c3a809e1d719 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -558,10 +558,18 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
return p;
}
-static inline void pde_set_flags(struct proc_dir_entry *pde)
+static void pde_set_flags(struct proc_dir_entry *pde)
{
if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
pde->flags |= PROC_ENTRY_PERMANENT;
+ if (pde->proc_ops->proc_read_iter)
+ pde->flags |= PROC_ENTRY_proc_read_iter;
+#ifdef CONFIG_COMPAT
+ if (pde->proc_ops->proc_compat_ioctl)
+ pde->flags |= PROC_ENTRY_proc_compat_ioctl;
+#endif
+ if (pde->proc_ops->proc_lseek)
+ pde->flags |= PROC_ENTRY_proc_lseek;
}
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
@@ -625,6 +633,7 @@ struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
p->proc_ops = &proc_seq_ops;
p->seq_ops = ops;
p->state_size = state_size;
+ pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_seq_private);
@@ -655,6 +664,7 @@ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
return NULL;
p->proc_ops = &proc_single_ops;
p->single_show = show;
+ pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_single_data);
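
pde_set_flags() snapshots the capabilities of the proc_ops table into flag bits at registration time, so later open and lookup paths (see the fs/proc/inode.c hunk below) test pde->flags instead of dereferencing proc_ops again. A standalone sketch of that snapshot pattern (types and names are illustrative stand-ins for the proc ones):

/* Standalone sketch (illustrative stand-ins): capture ops capabilities
 * once, at registration, into flag bits, as pde_set_flags() does. */
#include <stdio.h>
#include <stddef.h>

struct ops { int (*read_iter)(void); int (*lseek)(void); };

enum { HAS_READ_ITER = 1, HAS_LSEEK = 2 };

struct entry { const struct ops *ops; unsigned flags; };

static void set_flags(struct entry *e)	/* mirrors pde_set_flags() */
{
	if (e->ops->read_iter)
		e->flags |= HAS_READ_ITER;
	if (e->ops->lseek)
		e->flags |= HAS_LSEEK;
}

static int stub(void) { return 0; }

int main(void)
{
	static const struct ops seq_ops = { .lseek = stub };
	struct entry e = { .ops = &seq_ops };

	set_flags(&e);
	/* The open path consults the flags only, never e.ops. */
	printf("read_iter=%d lseek=%d\n",
	       !!(e.flags & HAS_READ_ITER), !!(e.flags & HAS_LSEEK));
	return 0;
}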
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index f495fdb39151..623aa0d97a6d 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -52,7 +52,7 @@ static void proc_evict_inode(struct inode *inode)
head = ei->sysctl;
if (head) {
- RCU_INIT_POINTER(ei->sysctl, NULL);
+ WRITE_ONCE(ei->sysctl, NULL);
proc_sys_evict_inode(inode, head);
}
}
@@ -494,7 +494,7 @@ static int proc_reg_open(struct inode *inode, struct file *file)
typeof_member(struct proc_ops, proc_release) release;
struct pde_opener *pdeo;
- if (!pde->proc_ops->proc_lseek)
+ if (!pde_has_proc_lseek(pde))
file->f_mode &= ~FMODE_LSEEK;
if (pde_is_permanent(pde)) {
@@ -679,13 +679,13 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
if (S_ISREG(inode->i_mode)) {
inode->i_op = de->proc_iops;
- if (de->proc_ops->proc_read_iter)
+ if (pde_has_proc_read_iter(de))
inode->i_fop = &proc_iter_file_ops;
else
inode->i_fop = &proc_reg_file_ops;
#ifdef CONFIG_COMPAT
- if (de->proc_ops->proc_compat_ioctl) {
- if (de->proc_ops->proc_read_iter)
+ if (pde_has_proc_compat_ioctl(de)) {
+ if (pde_has_proc_read_iter(de))
inode->i_fop = &proc_iter_file_ops_compat;
else
inode->i_fop = &proc_reg_file_ops_compat;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 6b921826d85b..019137261a03 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -84,6 +84,25 @@ static inline void pde_make_permanent(struct proc_dir_entry *pde)
pde->flags |= PROC_ENTRY_PERMANENT;
}
+static inline bool pde_has_proc_read_iter(const struct proc_dir_entry *pde)
+{
+ return pde->flags & PROC_ENTRY_proc_read_iter;
+}
+
+static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde)
+{
+#ifdef CONFIG_COMPAT
+ return pde->flags & PROC_ENTRY_proc_compat_ioctl;
+#else
+ return false;
+#endif
+}
+
+static inline bool pde_has_proc_lseek(const struct proc_dir_entry *pde)
+{
+ return pde->flags & PROC_ENTRY_proc_lseek;
+}
+
extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index df77a7bcce49..6db1489abc68 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -924,17 +924,21 @@ static int proc_sys_compare(const struct dentry *dentry,
struct ctl_table_header *head;
struct inode *inode;
- /* Although proc doesn't have negative dentries, rcu-walk means
- * that inode here can be NULL */
- /* AV: can it, indeed? */
- inode = d_inode_rcu(dentry);
- if (!inode)
- return 1;
if (name->len != len)
return 1;
if (memcmp(name->name, str, len))
return 1;
- head = rcu_dereference(PROC_I(inode)->sysctl);
+
+ // false positive is fine here - we'll recheck anyway
+ if (d_in_lookup(dentry))
+ return 0;
+
+ inode = d_inode_rcu(dentry);
+ // we just might have run into dentry in the middle of __dentry_kill()
+ if (!inode)
+ return 1;
+
+ head = READ_ONCE(PROC_I(inode)->sysctl);
return !head || !sysctl_is_seen(head);
}
diff --git a/fs/select.c b/fs/select.c
index 3f730b8581f6..e66b6189845e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -77,19 +77,16 @@ u64 select_estimate_accuracy(struct timespec64 *tv)
{
u64 ret;
struct timespec64 now;
+ u64 slack = current->timer_slack_ns;
- /*
- * Realtime tasks get a slack of 0 for obvious reasons.
- */
-
- if (rt_task(current))
+ if (slack == 0)
return 0;
ktime_get_ts64(&now);
now = timespec64_sub(*tv, now);
ret = __estimate_accuracy(&now);
- if (ret < current->timer_slack_ns)
- return current->timer_slack_ns;
+ if (ret < slack)
+ return slack;
return ret;
}
diff --git a/fs/smb/client/asn1.c b/fs/smb/client/asn1.c
index b5724ef9f182..214a44509e7b 100644
--- a/fs/smb/client/asn1.c
+++ b/fs/smb/client/asn1.c
@@ -52,6 +52,8 @@ int cifs_neg_token_init_mech_type(void *context, size_t hdrlen,
server->sec_kerberos = true;
else if (oid == OID_ntlmssp)
server->sec_ntlmssp = true;
+ else if (oid == OID_IAKerb)
+ server->sec_iakerb = true;
else {
char buf[50];
diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h
index 2f4e764c9ca9..dad31f0b7ffb 100644
--- a/fs/smb/client/cached_dir.h
+++ b/fs/smb/client/cached_dir.h
@@ -21,10 +21,10 @@ struct cached_dirent {
struct cached_dirents {
bool is_valid:1;
bool is_failed:1;
- struct dir_context *ctx; /*
- * Only used to make sure we only take entries
- * from a single context. Never dereferenced.
- */
+ struct file *file; /*
+ * Used to associate the cache with a single
+ * open file instance.
+ */
struct mutex de_mutex;
int pos; /* Expected ctx->pos */
struct list_head entries;
diff --git a/fs/smb/client/cifs_dfs_ref.c b/fs/smb/client/cifs_dfs_ref.c
index 020e71fe1454..876f9a43a99d 100644
--- a/fs/smb/client/cifs_dfs_ref.c
+++ b/fs/smb/client/cifs_dfs_ref.c
@@ -258,6 +258,31 @@ compose_mount_options_err:
goto compose_mount_options_out;
}
+static int set_dest_addr(struct smb3_fs_context *ctx, const char *full_path)
+{
+ struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr;
+ char *str_addr = NULL;
+ int rc;
+
+ rc = dns_resolve_server_name_to_ip(full_path, &str_addr, NULL);
+ if (rc < 0)
+ goto out;
+
+ rc = cifs_convert_address(addr, str_addr, strlen(str_addr));
+ if (!rc) {
+ cifs_dbg(FYI, "%s: failed to convert ip address\n", __func__);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ cifs_set_port(addr, ctx->port);
+ rc = 0;
+
+out:
+ kfree(str_addr);
+ return rc;
+}
+
/*
* Create a vfsmount that we can automount
*/
@@ -295,8 +320,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path)
ctx = smb3_fc2context(fc);
page = alloc_dentry_path();
- /* always use tree name prefix */
- full_path = build_path_from_dentry_optional_prefix(mntpt, page, true);
+ full_path = dfs_get_automount_devname(mntpt, page);
if (IS_ERR(full_path)) {
mnt = ERR_CAST(full_path);
goto out;
@@ -315,6 +339,12 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path)
goto out;
}
+ rc = set_dest_addr(ctx, full_path);
+ if (rc) {
+ mnt = ERR_PTR(rc);
+ goto out;
+ }
+
rc = smb3_parse_devname(full_path, ctx);
if (!rc)
mnt = fc_mount(fc);
diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c
index 1e6819daaaa7..8b58f494235f 100644
--- a/fs/smb/client/cifs_spnego.c
+++ b/fs/smb/client/cifs_spnego.c
@@ -130,11 +130,13 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo,
dp = description + strlen(description);
- /* for now, only sec=krb5 and sec=mskrb5 are valid */
+ /* for now, only sec=krb5, sec=mskrb5 and sec=iakerb are valid */
if (server->sec_kerberos)
sprintf(dp, ";sec=krb5");
else if (server->sec_mskerberos)
sprintf(dp, ";sec=mskrb5");
+ else if (server->sec_iakerb)
+ sprintf(dp, ";sec=iakerb");
else {
cifs_dbg(VFS, "unknown or missing server auth type, use krb5\n");
sprintf(dp, ";sec=krb5");
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 71e519bf65e2..6df50ff6d918 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -148,6 +148,7 @@ enum securityEnum {
NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
Kerberos, /* Kerberos via SPNEGO */
+ IAKerb, /* Kerberos proxy */
};
struct session_key {
@@ -674,8 +675,10 @@ struct TCP_Server_Info {
char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
__u32 sequence_number; /* for signing, protected by srv_mutex */
__u32 reconnect_instance; /* incremented on each reconnect */
+ __le32 session_key_id; /* retrieved from negotiate response and sent in session setup request */
struct session_key session_key;
unsigned long lstrp; /* when we got last response from this server */
+ unsigned long neg_start; /* when negotiate started (jiffies) */
struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
#define CIFS_NEGFLAVOR_UNENCAP 1 /* wct == 17, but no ext_sec */
#define CIFS_NEGFLAVOR_EXTENDED 2 /* wct == 17, ext_sec bit set */
@@ -685,6 +688,7 @@ struct TCP_Server_Info {
bool sec_kerberosu2u; /* supports U2U Kerberos */
bool sec_kerberos; /* supports plain Kerberos */
bool sec_mskerberos; /* supports legacy MS Kerberos */
+ bool sec_iakerb; /* supports pass-through auth for Kerberos (krb5 proxy) */
bool large_buf; /* is current buffer large? */
/* use SMBD connection instead of socket */
bool rdma;
@@ -2049,6 +2053,8 @@ static inline char *get_security_type_str(enum securityEnum sectype)
return "Kerberos";
case NTLMv2:
return "NTLMv2";
+ case IAKerb:
+ return "IAKerb";
default:
return "Unknown";
}
diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h
index 9cb457706334..a682c50d7ace 100644
--- a/fs/smb/client/cifspdu.h
+++ b/fs/smb/client/cifspdu.h
@@ -557,7 +557,7 @@ typedef union smb_com_session_setup_andx {
__le16 MaxBufferSize;
__le16 MaxMpxCount;
__le16 VcNumber;
- __u32 SessionKey;
+ __le32 SessionKey;
__le16 SecurityBlobLength;
__u32 Reserved;
__le32 Capabilities; /* see below */
@@ -576,7 +576,7 @@ typedef union smb_com_session_setup_andx {
__le16 MaxBufferSize;
__le16 MaxMpxCount;
__le16 VcNumber;
- __u32 SessionKey;
+ __le32 SessionKey;
__le16 CaseInsensitivePasswordLength; /* ASCII password len */
__le16 CaseSensitivePasswordLength; /* Unicode password length*/
__u32 Reserved; /* see below */
@@ -614,7 +614,7 @@ typedef union smb_com_session_setup_andx {
__le16 MaxBufferSize;
__le16 MaxMpxCount;
__le16 VcNumber;
- __u32 SessionKey;
+ __le32 SessionKey;
__le16 PasswordLength;
__u32 Reserved; /* encrypt key len and offset */
__le16 ByteCount;
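
Marking SessionKey as __le32 documents that the field stays in wire (little-endian) byte order end to end: CIFSSMBNegotiate stores pSMBr->SessionKey into server->session_key_id without conversion, and the same value is echoed back in the session setup request. A standalone sketch of handling such an opaque little-endian token (demo helpers, not the kernel's byte-order API):

/* Standalone sketch (demo helpers, not the kernel byte-order API):
 * an opaque little-endian token is stored and echoed in wire order,
 * and only converted when the host needs to interpret it. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

typedef uint32_t le32;		/* stand-in for the kernel's __le32 */

static uint32_t le32_to_cpu_demo(le32 v)
{
	uint8_t p[4];

	memcpy(p, &v, 4);
	return p[0] | p[1] << 8 | (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
}

int main(void)
{
	uint8_t raw[4] = { 0x78, 0x56, 0x34, 0x12 };	/* bytes off the wire */
	le32 wire_key;

	memcpy(&wire_key, raw, 4);
	le32 echoed = wire_key;		/* echo path: no conversion at all */

	(void)echoed;
	printf("session key id = 0x%08x\n", le32_to_cpu_demo(wire_key));
	return 0;
}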
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index f37e4da0fe40..9a30425b75a9 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -30,6 +30,9 @@ extern void cifs_small_buf_release(void *);
extern void free_rsp_buf(int, void *);
extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
unsigned int /* length */);
+extern int smb_send_kvec(struct TCP_Server_Info *server,
+ struct msghdr *msg,
+ size_t *sent);
extern unsigned int _get_xid(void);
extern void _free_xid(unsigned int);
#define get_xid() \
@@ -57,8 +60,29 @@ extern void exit_cifs_idmap(void);
extern int init_cifs_spnego(void);
extern void exit_cifs_spnego(void);
extern const char *build_path_from_dentry(struct dentry *, void *);
+char *__build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
+ const char *tree, int tree_len,
+ bool prefix);
extern char *build_path_from_dentry_optional_prefix(struct dentry *direntry,
void *page, bool prefix);
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+static inline char *dfs_get_automount_devname(struct dentry *dentry, void *page)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ struct TCP_Server_Info *server = tcon->ses->server;
+
+ if (unlikely(!server->origin_fullpath))
+ return ERR_PTR(-EREMOTE);
+
+ return __build_path_from_dentry_optional_prefix(dentry, page,
+ server->origin_fullpath,
+ strlen(server->origin_fullpath),
+ true);
+}
+#endif
+
static inline void *alloc_dentry_path(void)
{
return __getname();
@@ -152,6 +176,8 @@ extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name,
extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
struct cifsFileInfo **ret_file);
+extern int cifs_get_hardlink_path(struct cifs_tcon *tcon, struct inode *inode,
+ struct file *file);
extern unsigned int smbCalcSize(void *buf);
extern int decode_negTokenInit(unsigned char *security_blob, int length,
struct TCP_Server_Info *server);
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 6077fe1dcc9c..0c6ade196894 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -469,6 +469,7 @@ CIFSSMBNegotiate(const unsigned int xid,
server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf);
server->capabilities = le32_to_cpu(pSMBr->Capabilities);
+ server->session_key_id = pSMBr->SessionKey;
server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
server->timeAdj *= 60;
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index db30c4b8a221..c3480e84f5c6 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -410,6 +410,13 @@ static int __cifs_reconnect(struct TCP_Server_Info *server,
if (!cifs_tcp_ses_needs_reconnect(server, 1))
return 0;
+ /*
+ * If the smb session has been marked for reconnect, also reconnect all
+ * connections so that the other connections do not end up in a bad state.
+ */
+ if (mark_smb_session)
+ cifs_signal_cifsd_for_reconnect(server, mark_smb_session);
+
cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session);
cifs_abort_connection(server);
@@ -682,12 +689,12 @@ server_unresponsive(struct TCP_Server_Info *server)
/*
* If we're in the process of mounting a share or reconnecting a session
* and the server abruptly shut down (e.g. socket wasn't closed, packet
- * had been ACK'ed but no SMB response), don't wait longer than 20s to
- * negotiate protocol.
+ * had been ACK'ed but no SMB response), don't wait longer than 20s from
+ * when negotiate actually started.
*/
spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsInNegotiate &&
- time_after(jiffies, server->lstrp + 20 * HZ)) {
+ time_after(jiffies, server->neg_start + 20 * HZ)) {
spin_unlock(&server->srv_lock);
cifs_reconnect(server, false);
return true;
@@ -1881,9 +1888,8 @@ out_err:
/* this function must be called with ses_lock and chan_lock held */
static int match_session(struct cifs_ses *ses, struct smb3_fs_context *ctx)
{
- if (ctx->sectype != Unspecified &&
- ctx->sectype != ses->sectype)
- return 0;
+ struct TCP_Server_Info *server = ses->server;
+ enum securityEnum ctx_sec, ses_sec;
/*
* If an existing session is limited to less channels than
@@ -1892,11 +1898,20 @@ static int match_session(struct cifs_ses *ses, struct smb3_fs_context *ctx)
if (ses->chan_max < ctx->max_channels)
return 0;
- switch (ses->sectype) {
+ ctx_sec = server->ops->select_sectype(server, ctx->sectype);
+ ses_sec = server->ops->select_sectype(server, ses->sectype);
+
+ if (ctx_sec != ses_sec)
+ return 0;
+
+ switch (ctx_sec) {
+ case IAKerb:
case Kerberos:
if (!uid_eq(ctx->cred_uid, ses->cred_uid))
return 0;
break;
+ case NTLMv2:
+ case RawNTLMSSP:
default:
/* NULL username means anonymous session */
if (ses->user_name == NULL) {
@@ -2409,6 +2424,8 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
return 0;
if (tcon->nodelete != ctx->nodelete)
return 0;
+ if (tcon->posix_extensions != ctx->linux_ext)
+ return 0;
return 1;
}
@@ -2956,8 +2973,10 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
* sessinit is sent but no second negprot
*/
struct rfc1002_session_packet req = {};
- struct smb_hdr *smb_buf = (struct smb_hdr *)&req;
+ struct msghdr msg = {};
+ struct kvec iov = {};
unsigned int len;
+ size_t sent;
req.trailer.session_req.called_len = sizeof(req.trailer.session_req.called_name);
@@ -2986,10 +3005,18 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
* As per rfc1002, @len must be the number of bytes that follow the
* length field of an rfc1002 session request payload.
*/
- len = sizeof(req) - offsetof(struct rfc1002_session_packet, trailer.session_req);
+ len = sizeof(req.trailer.session_req);
+ req.type = RFC1002_SESSION_REQUEST;
+ req.flags = 0;
+ req.length = cpu_to_be16(len);
+ len += offsetof(typeof(req), trailer.session_req);
+ iov.iov_base = &req;
+ iov.iov_len = len;
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, len);
+ rc = smb_send_kvec(server, &msg, &sent);
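+ /* a short send leaves a broken RFC1002 frame on the wire, so abort the connection */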
+ if (rc < 0 || len != sent)
+ return (rc == -EINTR || rc == -EAGAIN) ? rc : -ECONNABORTED;
- smb_buf->smb_buf_length = cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | len);
- rc = smb_send(server, smb_buf, len);
/*
* The RFC1001 layer in at least one server requires a very short break
* before negprot, presumably because it does not expect negprot to follow
* so quickly.
@@ -2998,7 +3025,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
*/
usleep_range(1000, 2000);
- return rc;
+ return 0;
}
static int
@@ -4168,11 +4195,13 @@ int
cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses,
struct TCP_Server_Info *server)
{
+ bool in_retry = false;
int rc = 0;
if (!server->ops->need_neg || !server->ops->negotiate)
return -ENOSYS;
+retry:
/* only send once per connect */
spin_lock(&server->srv_lock);
if (server->tcpStatus != CifsGood &&
@@ -4188,10 +4217,20 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses,
return 0;
}
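+ /* stamp the start of negotiation; server_unresponsive() gives up 20s after neg_start */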
+ server->lstrp = jiffies;
server->tcpStatus = CifsInNegotiate;
+ server->neg_start = jiffies;
spin_unlock(&server->srv_lock);
rc = server->ops->negotiate(xid, ses, server);
+ if (rc == -EAGAIN) {
+ /* Allow one retry attempt */
+ if (!in_retry) {
+ in_retry = true;
+ goto retry;
+ }
+ rc = -EHOSTDOWN;
+ }
if (rc == 0) {
spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsInNegotiate)
diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
index 863c7bc3db86..477302157ab3 100644
--- a/fs/smb/client/dir.c
+++ b/fs/smb/client/dir.c
@@ -78,14 +78,13 @@ build_path_from_dentry(struct dentry *direntry, void *page)
prefix);
}
-char *
-build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
- bool prefix)
+char *__build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
+ const char *tree, int tree_len,
+ bool prefix)
{
int dfsplen;
int pplen = 0;
struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
- struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
char dirsep = CIFS_DIR_SEP(cifs_sb);
char *s;
@@ -93,7 +92,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
return ERR_PTR(-ENOMEM);
if (prefix)
- dfsplen = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1);
+ dfsplen = strnlen(tree, tree_len + 1);
else
dfsplen = 0;
@@ -123,7 +122,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
}
if (dfsplen) {
s -= dfsplen;
- memcpy(s, tcon->tree_name, dfsplen);
+ memcpy(s, tree, dfsplen);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
int i;
for (i = 0; i < dfsplen; i++) {
@@ -135,6 +134,16 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
return s;
}
+char *build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
+ bool prefix)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+
+ return __build_path_from_dentry_optional_prefix(direntry, page, tcon->tree_name,
+ MAX_TREE_SIZE, prefix);
+}
+
/*
* Don't allow path components longer than the server max.
* Don't allow the separator character in a path component.
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index d23dfc83de50..3551054ef097 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -687,6 +687,11 @@ int cifs_open(struct inode *inode, struct file *file)
} else {
_cifsFileInfo_put(cfile, true, false);
}
+ } else {
+ /* hard link on the deferred close file */
+ rc = cifs_get_hardlink_path(tcon, inode, file);
+ if (rc)
+ cifs_close_deferred_file(CIFS_I(inode));
}
if (server->oplocks)
@@ -1735,6 +1740,29 @@ cifs_move_llist(struct list_head *source, struct list_head *dest)
list_move(li, dest);
}
+int
+cifs_get_hardlink_path(struct cifs_tcon *tcon, struct inode *inode,
+ struct file *file)
+{
+ struct cifsFileInfo *open_file = NULL;
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
+ int rc = 0;
+
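+ /* an existing open on this inode with identical flags means the caller must close deferred handles first */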
+ spin_lock(&tcon->open_file_lock);
+ spin_lock(&cinode->open_file_lock);
+
+ list_for_each_entry(open_file, &cinode->openFileList, flist) {
+ if (file->f_flags == open_file->f_flags) {
+ rc = -EINVAL;
+ break;
+ }
+ }
+
+ spin_unlock(&cinode->open_file_lock);
+ spin_unlock(&tcon->open_file_lock);
+ return rc;
+}
+
void
cifs_free_llist(struct list_head *llist)
{
@@ -5161,7 +5189,8 @@ void cifs_oplock_break(struct work_struct *work)
struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
oplock_break);
struct inode *inode = d_inode(cfile->dentry);
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct super_block *sb = inode->i_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifsInodeInfo *cinode = CIFS_I(inode);
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
@@ -5171,6 +5200,12 @@ void cifs_oplock_break(struct work_struct *work)
__u64 persistent_fid, volatile_fid;
__u16 net_fid;
+ /*
+ * Hold a reference to the superblock to prevent it and its inodes from
+ * being freed while we are accessing cinode. Otherwise, _cifsFileInfo_put()
+ * may release the last reference to the sb and trigger inode eviction.
+ */
+ cifs_sb_active(sb);
wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
TASK_UNINTERRUPTIBLE);
@@ -5243,6 +5278,7 @@ oplock_break_ack:
cifs_put_tlink(tlink);
out:
cifs_done_oplock_break(cinode);
+ cifs_sb_deactive(sb);
}
/*
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index ca39d01077cd..e6d2e4162b08 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -1078,21 +1078,21 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->got_wsize = true;
break;
case Opt_acregmax:
- ctx->acregmax = HZ * result.uint_32;
- if (ctx->acregmax > CIFS_MAX_ACTIMEO) {
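+ /* compare against the divided limit to avoid overflowing HZ * result.uint_32 */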
+ if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) {
cifs_errorf(fc, "acregmax too large\n");
goto cifs_parse_mount_err;
}
+ ctx->acregmax = HZ * result.uint_32;
break;
case Opt_acdirmax:
- ctx->acdirmax = HZ * result.uint_32;
- if (ctx->acdirmax > CIFS_MAX_ACTIMEO) {
+ if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) {
cifs_errorf(fc, "acdirmax too large\n");
goto cifs_parse_mount_err;
}
+ ctx->acdirmax = HZ * result.uint_32;
break;
case Opt_actimeo:
- if (HZ * result.uint_32 > CIFS_MAX_ACTIMEO) {
+ if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) {
cifs_errorf(fc, "timeout too large\n");
goto cifs_parse_mount_err;
}
@@ -1104,13 +1104,18 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->acdirmax = ctx->acregmax = HZ * result.uint_32;
break;
case Opt_closetimeo:
- ctx->closetimeo = HZ * result.uint_32;
- if (ctx->closetimeo > SMB3_MAX_DCLOSETIMEO) {
+ if (result.uint_32 > SMB3_MAX_DCLOSETIMEO / HZ) {
cifs_errorf(fc, "closetimeo too large\n");
goto cifs_parse_mount_err;
}
+ ctx->closetimeo = HZ * result.uint_32;
break;
case Opt_echo_interval:
+ if (result.uint_32 < SMB_ECHO_INTERVAL_MIN ||
+ result.uint_32 > SMB_ECHO_INTERVAL_MAX) {
+ cifs_errorf(fc, "echo interval is out of bounds\n");
+ goto cifs_parse_mount_err;
+ }
ctx->echo_interval = result.uint_32;
break;
case Opt_snapshot:
diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c
index ae9905e2b9d4..7402070b7a06 100644
--- a/fs/smb/client/ioctl.c
+++ b/fs/smb/client/ioctl.c
@@ -246,7 +246,9 @@ static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server_it, &cifs_tcp_ses_list, tcp_ses_list) {
list_for_each_entry(ses_it, &server_it->smb_ses_list, smb_ses_list) {
- if (ses_it->Suid == out.session_id) {
+ spin_lock(&ses_it->ses_lock);
+ if (ses_it->ses_status != SES_EXITING &&
+ ses_it->Suid == out.session_id) {
ses = ses_it;
/*
* since we are using the session outside the crit
@@ -254,9 +256,11 @@ static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug
* so increment its refcount
*/
ses->ses_count++;
+ spin_unlock(&ses_it->ses_lock);
found = true;
goto search_end;
}
+ spin_unlock(&ses_it->ses_lock);
}
}
search_end:
diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c
index c0f101fc1e5d..d71feb3fdbd2 100644
--- a/fs/smb/client/link.c
+++ b/fs/smb/client/link.c
@@ -269,7 +269,7 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct cifs_io_parms io_parms = {0};
int buf_type = CIFS_NO_BUFFER;
- FILE_ALL_INFO file_info;
+ struct cifs_open_info_data query_data;
oparms = (struct cifs_open_parms) {
.tcon = tcon,
@@ -281,11 +281,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
.fid = &fid,
};
- rc = CIFS_open(xid, &oparms, &oplock, &file_info);
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &query_data);
if (rc)
return rc;
- if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
+ if (query_data.fi.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
rc = -ENOENT;
/* it's not a symlink */
goto out;
@@ -324,7 +324,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
.fid = &fid,
};
- rc = CIFS_open(xid, &oparms, &oplock, NULL);
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, NULL);
if (rc)
return rc;
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index 3826f7176608..99a0a1fe6618 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -307,6 +307,14 @@ check_smb_hdr(struct smb_hdr *smb)
if (smb->Command == SMB_COM_LOCKING_ANDX)
return 0;
+ /*
+ * Windows NT servers return an error response (e.g. STATUS_DELETE_PENDING,
+ * STATUS_OBJECT_NAME_NOT_FOUND, ERRDOS/ERRbadfile or any other) for some
+ * TRANS2 requests without the RESPONSE flag set in the header.
+ */
+ if (smb->Command == SMB_COM_TRANSACTION2 && smb->Status.CifsError != 0)
+ return 0;
+
cifs_dbg(VFS, "Server sent request, not response. mid=%u\n",
get_mid(smb));
return 1;
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index 9a1f1913fb59..20955d595e6a 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -765,7 +765,10 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
else
cifs_buf_release(cfile->srch_inf.
ntwrk_buf_start);
+ /* Reset all pointers to the network buffer to prevent stale references */
cfile->srch_inf.ntwrk_buf_start = NULL;
+ cfile->srch_inf.srch_entries_start = NULL;
+ cfile->srch_inf.last_entry = NULL;
}
rc = initiate_cifs_search(xid, file, full_path);
if (rc) {
@@ -788,11 +791,11 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
rc = server->ops->query_dir_next(xid, tcon, &cfile->fid,
search_flags,
&cfile->srch_inf);
+ if (rc)
+ return -ENOENT;
/* FindFirst/Next set last_entry to NULL on malformed reply */
if (cfile->srch_inf.last_entry)
cifs_save_resume_key(cfile->srch_inf.last_entry, cfile);
- if (rc)
- return -ENOENT;
}
if (index_to_find < cfile->srch_inf.index_of_last_entry) {
/* we found the buffer that contains the entry */
@@ -879,9 +882,9 @@ static bool emit_cached_dirents(struct cached_dirents *cde,
}
static void update_cached_dirents_count(struct cached_dirents *cde,
- struct dir_context *ctx)
+ struct file *file)
{
- if (cde->ctx != ctx)
+ if (cde->file != file)
return;
if (cde->is_valid || cde->is_failed)
return;
@@ -890,9 +893,9 @@ static void update_cached_dirents_count(struct cached_dirents *cde,
}
static void finished_cached_dirents_count(struct cached_dirents *cde,
- struct dir_context *ctx)
+ struct dir_context *ctx, struct file *file)
{
- if (cde->ctx != ctx)
+ if (cde->file != file)
return;
if (cde->is_valid || cde->is_failed)
return;
@@ -905,11 +908,12 @@ static void finished_cached_dirents_count(struct cached_dirents *cde,
static void add_cached_dirent(struct cached_dirents *cde,
struct dir_context *ctx,
const char *name, int namelen,
- struct cifs_fattr *fattr)
+ struct cifs_fattr *fattr,
+ struct file *file)
{
struct cached_dirent *de;
- if (cde->ctx != ctx)
+ if (cde->file != file)
return;
if (cde->is_valid || cde->is_failed)
return;
@@ -939,7 +943,8 @@ static void add_cached_dirent(struct cached_dirents *cde,
static bool cifs_dir_emit(struct dir_context *ctx,
const char *name, int namelen,
struct cifs_fattr *fattr,
- struct cached_fid *cfid)
+ struct cached_fid *cfid,
+ struct file *file)
{
bool rc;
ino_t ino = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
@@ -951,7 +956,7 @@ static bool cifs_dir_emit(struct dir_context *ctx,
if (cfid) {
mutex_lock(&cfid->dirents.de_mutex);
add_cached_dirent(&cfid->dirents, ctx, name, namelen,
- fattr);
+ fattr, file);
mutex_unlock(&cfid->dirents.de_mutex);
}
@@ -1051,7 +1056,7 @@ static int cifs_filldir(char *find_entry, struct file *file,
cifs_prime_dcache(file_dentry(file), &name, &fattr);
return !cifs_dir_emit(ctx, name.name, name.len,
- &fattr, cfid);
+ &fattr, cfid, file);
}
@@ -1102,8 +1107,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
* we need to initialize scanning and storing the
* directory content.
*/
- if (ctx->pos == 0 && cfid->dirents.ctx == NULL) {
- cfid->dirents.ctx = ctx;
+ if (ctx->pos == 0 && cfid->dirents.file == NULL) {
+ cfid->dirents.file = file;
cfid->dirents.pos = 2;
}
/*
@@ -1171,7 +1176,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
} else {
if (cfid) {
mutex_lock(&cfid->dirents.de_mutex);
- finished_cached_dirents_count(&cfid->dirents, ctx);
+ finished_cached_dirents_count(&cfid->dirents, ctx, file);
mutex_unlock(&cfid->dirents.de_mutex);
}
cifs_dbg(FYI, "Could not find entry\n");
@@ -1212,7 +1217,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
ctx->pos++;
if (cfid) {
mutex_lock(&cfid->dirents.de_mutex);
- update_cached_dirents_count(&cfid->dirents, ctx);
+ update_cached_dirents_count(&cfid->dirents, file);
mutex_unlock(&cfid->dirents.de_mutex);
}
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index b8e14bcd2c68..883d1cb1fc8b 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -605,6 +605,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses,
USHRT_MAX));
pSMB->req.MaxMpxCount = cpu_to_le16(server->maxReq);
pSMB->req.VcNumber = cpu_to_le16(1);
+ pSMB->req.SessionKey = server->session_key_id;
/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
@@ -1209,12 +1210,13 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
switch (requested) {
case Kerberos:
case RawNTLMSSP:
+ case IAKerb:
return requested;
case Unspecified:
if (server->sec_ntlmssp &&
(global_secflags & CIFSSEC_MAY_NTLMSSP))
return RawNTLMSSP;
- if ((server->sec_kerberos || server->sec_mskerberos) &&
+ if ((server->sec_kerberos || server->sec_mskerberos || server->sec_iakerb) &&
(global_secflags & CIFSSEC_MAY_KRB5))
return Kerberos;
fallthrough;
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index 225cc7e0304c..1489b9d21b60 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -426,13 +426,6 @@ cifs_negotiate(const unsigned int xid,
{
int rc;
rc = CIFSSMBNegotiate(xid, ses, server);
- if (rc == -EAGAIN) {
- /* retry only once on 1st time connection */
- set_credits(server, 1);
- rc = CIFSSMBNegotiate(xid, ses, server);
- if (rc == -EAGAIN)
- rc = -EHOSTDOWN;
- }
return rc;
}
diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c
index a7475bc05cac..afdc78e92ee9 100644
--- a/fs/smb/client/smb2file.c
+++ b/fs/smb/client/smb2file.c
@@ -108,16 +108,25 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32
int err_buftype = CIFS_NO_BUFFER;
struct cifs_fid *fid = oparms->fid;
struct network_resiliency_req nr_ioctl_req;
+ bool retry_without_read_attributes = false;
smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
if (smb2_path == NULL)
return -ENOMEM;
- oparms->desired_access |= FILE_READ_ATTRIBUTES;
+ if (!(oparms->desired_access & FILE_READ_ATTRIBUTES)) {
+ oparms->desired_access |= FILE_READ_ATTRIBUTES;
+ retry_without_read_attributes = true;
+ }
smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH;
rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov,
&err_buftype);
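+ /* the server may deny the implicitly added FILE_READ_ATTRIBUTES; retry with the caller's original mask */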
+ if (rc == -EACCES && retry_without_read_attributes) {
+ oparms->desired_access &= ~FILE_READ_ATTRIBUTES;
+ rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov,
+ &err_buftype);
+ }
if (rc && data) {
struct smb2_hdr *hdr = err_iov.iov_base;
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index 8c149cb531d3..01352d7c10f1 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -814,11 +814,12 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
WARN_ONCE(tcon->tc_count < 0, "tcon refcount is negative");
spin_unlock(&cifs_tcp_ses_lock);
- if (tcon->ses)
+ if (tcon->ses) {
server = tcon->ses->server;
-
- cifs_server_dbg(FYI, "tid=0x%x: tcon is closing, skipping async close retry of fid %llu %llu\n",
- tcon->tid, persistent_fid, volatile_fid);
+ cifs_server_dbg(FYI,
+ "tid=0x%x: tcon is closing, skipping async close retry of fid %llu %llu\n",
+ tcon->tid, persistent_fid, volatile_fid);
+ }
return 0;
}
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index a62f3e5a7689..7d720cf9fe72 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -422,9 +422,6 @@ smb2_negotiate(const unsigned int xid,
server->CurrentMid = 0;
spin_unlock(&server->mid_lock);
rc = SMB2_negotiate(xid, ses, server);
- /* BB we probably don't need to retry with modern servers */
- if (rc == -EAGAIN)
- rc = -EHOSTDOWN;
return rc;
}
@@ -4500,6 +4497,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
u8 key[SMB3_ENC_DEC_KEY_SIZE];
struct aead_request *req;
u8 *iv;
+ DECLARE_CRYPTO_WAIT(wait);
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
void *creq;
@@ -4548,7 +4546,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
aead_request_set_crypt(req, sg, sg, crypt_len, iv);
aead_request_set_ad(req, assoc_data_len);
- rc = enc ? crypto_aead_encrypt(req) : crypto_aead_decrypt(req);
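+ /* the AEAD request may complete asynchronously; wait for it to finish */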
+ aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &wait);
+
+ rc = crypto_wait_req(enc ? crypto_aead_encrypt(req)
+ : crypto_aead_decrypt(req), &wait);
if (!rc && enc)
memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE);
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 217d381eb9fe..c28e39c67c4f 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -1270,7 +1270,7 @@ smb2_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
if (server->sec_ntlmssp &&
(global_secflags & CIFSSEC_MAY_NTLMSSP))
return RawNTLMSSP;
- if ((server->sec_kerberos || server->sec_mskerberos) &&
+ if ((server->sec_kerberos || server->sec_mskerberos || server->sec_iakerb) &&
(global_secflags & CIFSSEC_MAY_KRB5))
return Kerberos;
fallthrough;
@@ -1999,7 +1999,7 @@ tcon_exit:
tcon_error_exit:
if (rsp && rsp->hdr.Status == STATUS_BAD_NETWORK_NAME)
- cifs_tcon_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
+ cifs_dbg(VFS | ONCE, "BAD_NETWORK_NAME: %s\n", tree);
goto tcon_exit;
}
@@ -2826,7 +2826,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
/* Eventually save off posix specific response info and timestamps */
err_free_rsp_buf:
- free_rsp_buf(resp_buftype, rsp);
+ free_rsp_buf(resp_buftype, rsp_iov.iov_base);
kfree(pc_buf);
err_free_req:
cifs_small_buf_release(req);
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index cf923f211c51..d47eae133a20 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -455,7 +455,6 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
log_rdma_recv(INFO, "wc->status=%d opcode=%d\n",
wc->status, wc->opcode);
- smbd_disconnect_rdma_connection(info);
goto error;
}
@@ -472,8 +471,9 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
info->full_packet_received = true;
info->negotiate_done =
process_negotiation_response(response, wc->byte_len);
+ put_receive_buffer(info, response);
complete(&info->negotiate_completion);
- break;
+ return;
/* SMBD data transfer packet */
case SMBD_TRANSFER_DATA:
@@ -530,14 +530,16 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
}
return;
-
- default:
- log_rdma_recv(ERR,
- "unexpected response type=%d\n", response->type);
}
+ /*
+ * This is an internal error!
+ */
+ log_rdma_recv(ERR, "unexpected response type=%d\n", response->type);
+ WARN_ON_ONCE(response->type != SMBD_TRANSFER_DATA);
error:
put_receive_buffer(info, response);
+ smbd_disconnect_rdma_connection(info);
}
static struct rdma_cm_id *smbd_create_id(
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 3fdafb9297f1..d2867bd263c5 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -178,7 +178,7 @@ delete_mid(struct mid_q_entry *mid)
* Our basic "send data to server" function. Should be called with srv_mutex
* held. The caller is responsible for handling the results.
*/
-static int
+int
smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg,
size_t *sent)
{
diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c
index 8e24a6665abd..f8a192cc82f2 100644
--- a/fs/smb/server/auth.c
+++ b/fs/smb/server/auth.c
@@ -544,7 +544,19 @@ int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob,
retval = -ENOMEM;
goto out;
}
- sess->user = user;
+
+ if (!sess->user) {
+ /* First successful authentication */
+ sess->user = user;
+ } else {
+ if (!ksmbd_compare_user(sess->user, user)) {
+ ksmbd_debug(AUTH, "different user tried to reuse session\n");
+ retval = -EPERM;
+ ksmbd_free_user(user);
+ goto out;
+ }
+ ksmbd_free_user(user);
+ }
memcpy(sess->sess_key, resp->payload, resp->session_key_len);
memcpy(out_blob, resp->payload + resp->session_key_len,
@@ -1010,9 +1022,9 @@ static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id,
ses_enc_key = enc ? sess->smb3encryptionkey :
sess->smb3decryptionkey;
- if (enc)
- ksmbd_user_session_get(sess);
memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE);
+ if (!enc)
+ ksmbd_user_session_put(sess);
return 0;
}
@@ -1211,7 +1223,7 @@ free_iv:
free_sg:
kfree(sg);
free_req:
- kfree(req);
+ aead_request_free(req);
free_ctx:
ksmbd_release_crypto_ctx(ctx);
return rc;
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index 0e04cf8b1d89..0e72be594e91 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -45,6 +45,7 @@ struct ksmbd_conn {
struct mutex srv_mutex;
int status;
unsigned int cli_cap;
+ __be32 inet_addr;
char *request_buf;
struct ksmbd_transport *transport;
struct nls_table *local_nls;
diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c
index b1c2219ec0b4..f00fa7760412 100644
--- a/fs/smb/server/mgmt/user_session.c
+++ b/fs/smb/server/mgmt/user_session.c
@@ -180,7 +180,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn)
down_write(&sessions_table_lock);
down_write(&conn->session_lock);
xa_for_each(&conn->sessions, id, sess) {
- if (atomic_read(&sess->refcnt) == 0 &&
+ if (atomic_read(&sess->refcnt) <= 1 &&
(sess->state != SMB2_SESSION_VALID ||
time_after(jiffies,
sess->last_active + SMB2_SESSION_TIMEOUT))) {
@@ -229,7 +229,11 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
if (!ksmbd_chann_del(conn, sess) &&
xa_empty(&sess->ksmbd_chann_list)) {
hash_del(&sess->hlist);
- ksmbd_session_destroy(sess);
+ down_write(&conn->session_lock);
+ xa_erase(&conn->sessions, sess->id);
+ up_write(&conn->session_lock);
+ if (atomic_dec_and_test(&sess->refcnt))
+ ksmbd_session_destroy(sess);
}
}
}
@@ -248,13 +252,30 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
if (xa_empty(&sess->ksmbd_chann_list)) {
xa_erase(&conn->sessions, sess->id);
hash_del(&sess->hlist);
- ksmbd_session_destroy(sess);
+ if (atomic_dec_and_test(&sess->refcnt))
+ ksmbd_session_destroy(sess);
}
}
up_write(&conn->session_lock);
up_write(&sessions_table_lock);
}
+bool is_ksmbd_session_in_connection(struct ksmbd_conn *conn,
+ unsigned long long id)
+{
+ struct ksmbd_session *sess;
+
+ down_read(&conn->session_lock);
+ sess = xa_load(&conn->sessions, id);
+ if (sess) {
+ up_read(&conn->session_lock);
+ return true;
+ }
+ up_read(&conn->session_lock);
+
+ return false;
+}
+
struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
unsigned long long id)
{
@@ -308,8 +329,8 @@ void ksmbd_user_session_put(struct ksmbd_session *sess)
if (atomic_read(&sess->refcnt) <= 0)
WARN_ON(1);
- else
- atomic_dec(&sess->refcnt);
+ else if (atomic_dec_and_test(&sess->refcnt))
+ ksmbd_session_destroy(sess);
}
struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn,
@@ -380,7 +401,7 @@ static struct ksmbd_session *__session_create(int protocol)
xa_init(&sess->rpc_handle_list);
sess->sequence_number = 1;
rwlock_init(&sess->tree_conns_lock);
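+ /* start at 2: one reference for the creator, one kept by the session table until deregister */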
- atomic_set(&sess->refcnt, 1);
+ atomic_set(&sess->refcnt, 2);
ret = __init_smb2_session(sess);
if (ret)
diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h
index ce91b1d698e7..f4da293c4dbb 100644
--- a/fs/smb/server/mgmt/user_session.h
+++ b/fs/smb/server/mgmt/user_session.h
@@ -87,6 +87,8 @@ void ksmbd_session_destroy(struct ksmbd_session *sess);
struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id);
struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
unsigned long long id);
+bool is_ksmbd_session_in_connection(struct ksmbd_conn *conn,
+ unsigned long long id);
int ksmbd_session_register(struct ksmbd_conn *conn,
struct ksmbd_session *sess);
void ksmbd_sessions_deregister(struct ksmbd_conn *conn);
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index b29e78b517bf..258ed9978b90 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -1515,7 +1515,7 @@ void create_lease_buf(u8 *rbuf, struct lease *lease)
* @open_req: buffer containing smb2 file open(create) request
* @is_dir: whether leasing file is directory
*
- * Return: oplock state, -ENOENT if create lease context not found
+ * Return: allocated lease context object on success, otherwise NULL
*/
struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir)
{
@@ -1534,6 +1534,10 @@ struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir)
if (sizeof(struct lease_context_v2) == le32_to_cpu(cc->DataLength)) {
struct create_lease_v2 *lc = (struct create_lease_v2 *)cc;
+ if (le16_to_cpu(cc->DataOffset) + le32_to_cpu(cc->DataLength) <
+ sizeof(struct create_lease_v2) - 4)
+ goto err_out;
+
memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);
if (is_dir) {
lreq->req_state = lc->lcontext.LeaseState &
@@ -1551,6 +1555,10 @@ struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir)
} else {
struct create_lease *lc = (struct create_lease *)cc;
+ if (le16_to_cpu(cc->DataOffset) + le32_to_cpu(cc->DataLength) <
+ sizeof(struct create_lease))
+ goto err_out;
+
memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);
lreq->req_state = lc->lcontext.LeaseState;
lreq->flags = lc->lcontext.LeaseFlags;
@@ -1558,6 +1566,9 @@ struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir)
lreq->version = 1;
}
return lreq;
+err_out:
+ kfree(lreq);
+ return NULL;
}
/**
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 9d041fc558e3..3e2cd22fb2bd 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -1356,8 +1356,7 @@ static int ntlm_negotiate(struct ksmbd_work *work,
return rc;
sz = le16_to_cpu(rsp->SecurityBufferOffset);
- chgblob =
- (struct challenge_message *)((char *)&rsp->hdr.ProtocolId + sz);
+ chgblob = (struct challenge_message *)rsp->Buffer;
memset(chgblob, 0, sizeof(struct challenge_message));
if (!work->conn->use_spnego) {
@@ -1390,8 +1389,7 @@ static int ntlm_negotiate(struct ksmbd_work *work,
goto out;
}
- sz = le16_to_cpu(rsp->SecurityBufferOffset);
- memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len);
+ memcpy(rsp->Buffer, spnego_blob, spnego_blob_len);
rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len);
out:
@@ -1473,8 +1471,7 @@ static int ntlm_authenticate(struct ksmbd_work *work,
if (rc)
return -ENOMEM;
- sz = le16_to_cpu(rsp->SecurityBufferOffset);
- memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len);
+ memcpy(rsp->Buffer, spnego_blob, spnego_blob_len);
rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len);
kfree(spnego_blob);
}
@@ -1610,27 +1607,38 @@ static int krb5_authenticate(struct ksmbd_work *work,
out_len = work->response_sz -
(le16_to_cpu(rsp->SecurityBufferOffset) + 4);
- /* Check previous session */
- prev_sess_id = le64_to_cpu(req->PreviousSessionId);
- if (prev_sess_id && prev_sess_id != sess->id)
- destroy_previous_session(conn, sess->user, prev_sess_id);
-
- if (sess->state == SMB2_SESSION_VALID)
- ksmbd_free_user(sess->user);
-
retval = ksmbd_krb5_authenticate(sess, in_blob, in_len,
out_blob, &out_len);
if (retval) {
ksmbd_debug(SMB, "krb5 authentication failed\n");
return -EINVAL;
}
+
+ /* Check previous session */
+ prev_sess_id = le64_to_cpu(req->PreviousSessionId);
+ if (prev_sess_id && prev_sess_id != sess->id)
+ destroy_previous_session(conn, sess->user, prev_sess_id);
+
rsp->SecurityBufferLength = cpu_to_le16(out_len);
- if ((conn->sign || server_conf.enforced_signing) ||
+ /*
+ * If the session state is SMB2_SESSION_VALID, we can assume this is
+ * a reauthentication and the user/password has already been verified,
+ * so return here.
+ */
+ if (sess->state == SMB2_SESSION_VALID) {
+ if (conn->binding)
+ goto binding_session;
+ return 0;
+ }
+
+ if ((rsp->SessionFlags != SMB2_SESSION_FLAG_IS_GUEST_LE &&
+ (conn->sign || server_conf.enforced_signing)) ||
(req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED))
sess->sign = true;
- if (smb3_encryption_negotiated(conn)) {
+ if (smb3_encryption_negotiated(conn) &&
+ !(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) {
retval = conn->ops->generate_encryptionkey(conn, sess);
if (retval) {
ksmbd_debug(SMB,
@@ -1643,6 +1651,7 @@ static int krb5_authenticate(struct ksmbd_work *work,
sess->sign = false;
}
+binding_session:
if (conn->dialect >= SMB30_PROT_ID) {
chann = lookup_chann_list(sess, conn);
if (!chann) {
@@ -1723,44 +1732,38 @@ int smb2_sess_setup(struct ksmbd_work *work)
if (conn->dialect != sess->dialect) {
rc = -EINVAL;
- ksmbd_user_session_put(sess);
goto out_err;
}
if (!(req->hdr.Flags & SMB2_FLAGS_SIGNED)) {
rc = -EINVAL;
- ksmbd_user_session_put(sess);
goto out_err;
}
if (strncmp(conn->ClientGUID, sess->ClientGUID,
SMB2_CLIENT_GUID_SIZE)) {
rc = -ENOENT;
- ksmbd_user_session_put(sess);
goto out_err;
}
if (sess->state == SMB2_SESSION_IN_PROGRESS) {
rc = -EACCES;
- ksmbd_user_session_put(sess);
goto out_err;
}
if (sess->state == SMB2_SESSION_EXPIRED) {
rc = -EFAULT;
- ksmbd_user_session_put(sess);
goto out_err;
}
- ksmbd_user_session_put(sess);
if (ksmbd_conn_need_reconnect(conn)) {
rc = -EFAULT;
+ ksmbd_user_session_put(sess);
sess = NULL;
goto out_err;
}
- sess = ksmbd_session_lookup(conn, sess_id);
- if (!sess) {
+ if (is_ksmbd_session_in_connection(conn, sess_id)) {
rc = -EACCES;
goto out_err;
}
@@ -1839,8 +1842,6 @@ int smb2_sess_setup(struct ksmbd_work *work)
ksmbd_conn_set_good(conn);
sess->state = SMB2_SESSION_VALID;
}
- kfree(sess->Preauth_HashValue);
- sess->Preauth_HashValue = NULL;
} else if (conn->preferred_auth_mech == KSMBD_AUTH_NTLMSSP) {
if (negblob->MessageType == NtLmNegotiate) {
rc = ntlm_negotiate(work, negblob, negblob_len, rsp);
@@ -1867,8 +1868,6 @@ int smb2_sess_setup(struct ksmbd_work *work)
kfree(preauth_sess);
}
}
- kfree(sess->Preauth_HashValue);
- sess->Preauth_HashValue = NULL;
} else {
pr_info_ratelimited("Unknown NTLMSSP message type : 0x%x\n",
le32_to_cpu(negblob->MessageType));
@@ -1926,6 +1925,8 @@ out_err:
sess->last_active = jiffies;
sess->state = SMB2_SESSION_EXPIRED;
+ ksmbd_user_session_put(sess);
+ work->sess = NULL;
if (try_delay) {
ksmbd_conn_set_need_reconnect(conn);
ssleep(5);
@@ -2246,13 +2247,14 @@ int smb2_session_logoff(struct ksmbd_work *work)
return -ENOENT;
}
- ksmbd_destroy_file_table(&sess->file_table);
down_write(&conn->session_lock);
sess->state = SMB2_SESSION_EXPIRED;
up_write(&conn->session_lock);
- ksmbd_free_user(sess->user);
- sess->user = NULL;
+ if (sess->user) {
+ ksmbd_free_user(sess->user);
+ sess->user = NULL;
+ }
ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_NEGOTIATE);
rsp->StructureSize = cpu_to_le16(4);
@@ -2691,7 +2693,7 @@ int smb2_open(struct ksmbd_work *work)
int req_op_level = 0, open_flags = 0, may_flags = 0, file_info = 0;
int rc = 0;
int contxt_cnt = 0, query_disk_id = 0;
- int maximal_access_ctxt = 0, posix_ctxt = 0;
+ bool maximal_access_ctxt = false, posix_ctxt = false;
int s_type = 0;
int next_off = 0;
char *name = NULL;
@@ -2718,6 +2720,27 @@ int smb2_open(struct ksmbd_work *work)
return create_smb2_pipe(work);
}
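+ /* parse the POSIX create context early; posix_ctxt relaxes filename validation below */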
+ if (req->CreateContextsOffset && tcon->posix_extensions) {
+ context = smb2_find_context_vals(req, SMB2_CREATE_TAG_POSIX, 16);
+ if (IS_ERR(context)) {
+ rc = PTR_ERR(context);
+ goto err_out2;
+ } else if (context) {
+ struct create_posix *posix = (struct create_posix *)context;
+
+ if (le16_to_cpu(context->DataOffset) +
+ le32_to_cpu(context->DataLength) <
+ sizeof(struct create_posix) - 4) {
+ rc = -EINVAL;
+ goto err_out2;
+ }
+ ksmbd_debug(SMB, "get posix context\n");
+
+ posix_mode = le32_to_cpu(posix->Mode);
+ posix_ctxt = true;
+ }
+ }
+
if (req->NameLength) {
if ((req->CreateOptions & FILE_DIRECTORY_FILE_LE) &&
*(char *)req->Buffer == '\\') {
@@ -2749,9 +2772,11 @@ int smb2_open(struct ksmbd_work *work)
goto err_out2;
}
- rc = ksmbd_validate_filename(name);
- if (rc < 0)
- goto err_out2;
+ if (posix_ctxt == false) {
+ rc = ksmbd_validate_filename(name);
+ if (rc < 0)
+ goto err_out2;
+ }
if (ksmbd_share_veto_filename(share, name)) {
rc = -ENOENT;
@@ -2866,28 +2891,6 @@ int smb2_open(struct ksmbd_work *work)
rc = -EBADF;
goto err_out2;
}
-
- if (tcon->posix_extensions) {
- context = smb2_find_context_vals(req,
- SMB2_CREATE_TAG_POSIX, 16);
- if (IS_ERR(context)) {
- rc = PTR_ERR(context);
- goto err_out2;
- } else if (context) {
- struct create_posix *posix =
- (struct create_posix *)context;
- if (le16_to_cpu(context->DataOffset) +
- le32_to_cpu(context->DataLength) <
- sizeof(struct create_posix) - 4) {
- rc = -EINVAL;
- goto err_out2;
- }
- ksmbd_debug(SMB, "get posix context\n");
-
- posix_mode = le32_to_cpu(posix->Mode);
- posix_ctxt = 1;
- }
- }
}
if (ksmbd_override_fsids(work)) {
@@ -3244,7 +3247,7 @@ int smb2_open(struct ksmbd_work *work)
goto err_out1;
}
} else {
- if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) {
+ if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE && lc) {
/*
* Compare the parent lease using the parent key. If there is no
* lease with the same parent key, send a lease break
@@ -7124,17 +7127,17 @@ out_check_cl:
}
no_check_cl:
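+ /* unlink the lock up front so zero-length requests are also removed from the list */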
+ flock = smb_lock->fl;
+ list_del(&smb_lock->llist);
+
if (smb_lock->zero_len) {
err = 0;
goto skip;
}
-
- flock = smb_lock->fl;
- list_del(&smb_lock->llist);
retry:
rc = vfs_lock_file(filp, smb_lock->cmd, flock, NULL);
skip:
- if (flags & SMB2_LOCKFLAG_UNLOCK) {
+ if (smb_lock->flags & SMB2_LOCKFLAG_UNLOCK) {
if (!rc) {
ksmbd_debug(SMB, "File unlocked\n");
} else if (rc == -ENOENT) {
@@ -8151,11 +8154,6 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work)
goto err_out;
}
- opinfo->op_state = OPLOCK_STATE_NONE;
- wake_up_interruptible_all(&opinfo->oplock_q);
- opinfo_put(opinfo);
- ksmbd_fd_put(work, fp);
-
rsp->StructureSize = cpu_to_le16(24);
rsp->OplockLevel = rsp_oplevel;
rsp->Reserved = 0;
@@ -8163,16 +8161,15 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work)
rsp->VolatileFid = volatile_id;
rsp->PersistentFid = persistent_id;
ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_oplock_break));
- if (!ret)
- return;
-
+ if (ret) {
err_out:
+ smb2_set_err_rsp(work);
+ }
+
opinfo->op_state = OPLOCK_STATE_NONE;
wake_up_interruptible_all(&opinfo->oplock_q);
-
opinfo_put(opinfo);
ksmbd_fd_put(work, fp);
- smb2_set_err_rsp(work);
}
static int check_lease_state(struct lease *lease, __le32 req_state)
@@ -8302,11 +8299,6 @@ static void smb21_lease_break_ack(struct ksmbd_work *work)
}
lease_state = lease->state;
- opinfo->op_state = OPLOCK_STATE_NONE;
- wake_up_interruptible_all(&opinfo->oplock_q);
- atomic_dec(&opinfo->breaking_cnt);
- wake_up_interruptible_all(&opinfo->oplock_brk);
- opinfo_put(opinfo);
rsp->StructureSize = cpu_to_le16(36);
rsp->Reserved = 0;
@@ -8315,16 +8307,16 @@ static void smb21_lease_break_ack(struct ksmbd_work *work)
rsp->LeaseState = lease_state;
rsp->LeaseDuration = 0;
ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_lease_ack));
- if (!ret)
- return;
-
+ if (ret) {
err_out:
+ smb2_set_err_rsp(work);
+ }
+
+ opinfo->op_state = OPLOCK_STATE_NONE;
wake_up_interruptible_all(&opinfo->oplock_q);
atomic_dec(&opinfo->breaking_cnt);
wake_up_interruptible_all(&opinfo->oplock_brk);
-
opinfo_put(opinfo);
- smb2_set_err_rsp(work);
}
/**
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index 7134abeeb53e..2850802f4a50 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -508,7 +508,7 @@ int ksmbd_extract_shortname(struct ksmbd_conn *conn, const char *longname,
p = strrchr(longname, '.');
if (p == longname) { /* name starts with a dot */
- strscpy(extension, "___", strlen("___"));
+ strscpy(extension, "___", sizeof(extension));
} else {
if (p) {
p++;
diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c
index 6fd3560028d3..90c5e3edbf46 100644
--- a/fs/smb/server/smbacl.c
+++ b/fs/smb/server/smbacl.c
@@ -270,6 +270,11 @@ static int sid_to_id(struct user_namespace *user_ns,
return -EIO;
}
+ if (psid->num_subauth == 0) {
+ pr_err("%s: zero subauthorities!\n", __func__);
+ return -EIO;
+ }
+
if (sidtype == SIDOWNER) {
kuid_t uid;
uid_t id;
@@ -398,7 +403,9 @@ static void parse_dacl(struct user_namespace *user_ns,
if (num_aces <= 0)
return;
- if (num_aces > ULONG_MAX / sizeof(struct smb_ace *))
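+ /* bound num_aces by how many minimum-sized ACEs could actually fit in the DACL buffer */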
+ if (num_aces > (le16_to_cpu(pdacl->size) - sizeof(struct smb_acl)) /
+ (offsetof(struct smb_ace, sid) +
+ offsetof(struct smb_sid, sub_auth) + sizeof(__le16)))
return;
ret = init_acl_state(&acl_state, num_aces);
@@ -432,6 +439,7 @@ static void parse_dacl(struct user_namespace *user_ns,
offsetof(struct smb_sid, sub_auth);
if (end_of_acl - acl_base < acl_size ||
+ ppace[i]->sid.num_subauth == 0 ||
ppace[i]->sid.num_subauth > SID_MAX_SUB_AUTHORITIES ||
(end_of_acl - acl_base <
acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth) ||
diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c
index 496855f755ac..7fc4b33b89e3 100644
--- a/fs/smb/server/transport_ipc.c
+++ b/fs/smb/server/transport_ipc.c
@@ -267,6 +267,7 @@ static int handle_response(int type, void *payload, size_t sz)
if (entry->type + 1 != type) {
pr_err("Waiting for IPC type %d, got %d. Ignore.\n",
entry->type + 1, type);
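+ /* type mismatch: keep scanning, the response may belong to another pending entry */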
+ continue;
}
entry->response = kvzalloc(sz, GFP_KERNEL);
@@ -295,7 +296,11 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req)
server_conf.signing = req->signing;
server_conf.tcp_port = req->tcp_port;
server_conf.ipc_timeout = req->ipc_timeout * HZ;
- server_conf.deadtime = req->deadtime * SMB_ECHO_INTERVAL;
+ if (check_mul_overflow(req->deadtime, SMB_ECHO_INTERVAL,
+ &server_conf.deadtime)) {
+ ret = -EINVAL;
+ goto out;
+ }
server_conf.share_fake_fscaps = req->share_fake_fscaps;
ksmbd_init_domain(req->sub_auth);
@@ -318,6 +323,7 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req)
ret |= ksmbd_set_work_group(req->work_group);
ret |= ksmbd_tcp_set_interfaces(KSMBD_STARTUP_CONFIG_INTERFACES(req),
req->ifc_list_sz);
+out:
if (ret) {
pr_err("Server configuration error: %s %s %s\n",
req->netbios_name, req->server_string,
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index 8faa25c6e129..7d59ed6e1383 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -128,9 +128,6 @@ struct smb_direct_transport {
spinlock_t recvmsg_queue_lock;
struct list_head recvmsg_queue;
- spinlock_t empty_recvmsg_queue_lock;
- struct list_head empty_recvmsg_queue;
-
int send_credit_target;
atomic_t send_credits;
spinlock_t lock_new_recv_credits;
@@ -266,40 +263,19 @@ smb_direct_recvmsg *get_free_recvmsg(struct smb_direct_transport *t)
static void put_recvmsg(struct smb_direct_transport *t,
struct smb_direct_recvmsg *recvmsg)
{
- ib_dma_unmap_single(t->cm_id->device, recvmsg->sge.addr,
- recvmsg->sge.length, DMA_FROM_DEVICE);
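+ /* sge.length == 0 marks a recvmsg whose buffer is not currently DMA-mapped */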
+ if (likely(recvmsg->sge.length != 0)) {
+ ib_dma_unmap_single(t->cm_id->device,
+ recvmsg->sge.addr,
+ recvmsg->sge.length,
+ DMA_FROM_DEVICE);
+ recvmsg->sge.length = 0;
+ }
spin_lock(&t->recvmsg_queue_lock);
list_add(&recvmsg->list, &t->recvmsg_queue);
spin_unlock(&t->recvmsg_queue_lock);
}
-static struct
-smb_direct_recvmsg *get_empty_recvmsg(struct smb_direct_transport *t)
-{
- struct smb_direct_recvmsg *recvmsg = NULL;
-
- spin_lock(&t->empty_recvmsg_queue_lock);
- if (!list_empty(&t->empty_recvmsg_queue)) {
- recvmsg = list_first_entry(&t->empty_recvmsg_queue,
- struct smb_direct_recvmsg, list);
- list_del(&recvmsg->list);
- }
- spin_unlock(&t->empty_recvmsg_queue_lock);
- return recvmsg;
-}
-
-static void put_empty_recvmsg(struct smb_direct_transport *t,
- struct smb_direct_recvmsg *recvmsg)
-{
- ib_dma_unmap_single(t->cm_id->device, recvmsg->sge.addr,
- recvmsg->sge.length, DMA_FROM_DEVICE);
-
- spin_lock(&t->empty_recvmsg_queue_lock);
- list_add_tail(&recvmsg->list, &t->empty_recvmsg_queue);
- spin_unlock(&t->empty_recvmsg_queue_lock);
-}
-
static void enqueue_reassembly(struct smb_direct_transport *t,
struct smb_direct_recvmsg *recvmsg,
int data_length)
@@ -384,9 +360,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
spin_lock_init(&t->recvmsg_queue_lock);
INIT_LIST_HEAD(&t->recvmsg_queue);
- spin_lock_init(&t->empty_recvmsg_queue_lock);
- INIT_LIST_HEAD(&t->empty_recvmsg_queue);
-
init_waitqueue_head(&t->wait_send_pending);
atomic_set(&t->send_pending, 0);
@@ -426,7 +399,8 @@ static void free_transport(struct smb_direct_transport *t)
if (t->qp) {
ib_drain_qp(t->qp);
ib_mr_pool_destroy(t->qp, &t->qp->rdma_mrs);
- ib_destroy_qp(t->qp);
+ t->qp = NULL;
+ rdma_destroy_qp(t->cm_id);
}
ksmbd_debug(RDMA, "drain the reassembly queue\n");
@@ -541,13 +515,13 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
t = recvmsg->transport;
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
+ put_recvmsg(t, recvmsg);
if (wc->status != IB_WC_WR_FLUSH_ERR) {
pr_err("Recv error. status='%s (%d)' opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status,
wc->opcode);
smb_direct_disconnect_rdma_connection(t);
}
- put_empty_recvmsg(t, recvmsg);
return;
}
@@ -561,7 +535,8 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
switch (recvmsg->type) {
case SMB_DIRECT_MSG_NEGOTIATE_REQ:
if (wc->byte_len < sizeof(struct smb_direct_negotiate_req)) {
- put_empty_recvmsg(t, recvmsg);
+ put_recvmsg(t, recvmsg);
+ smb_direct_disconnect_rdma_connection(t);
return;
}
t->negotiation_requested = true;
@@ -569,7 +544,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
t->status = SMB_DIRECT_CS_CONNECTED;
enqueue_reassembly(t, recvmsg, 0);
wake_up_interruptible(&t->wait_status);
- break;
+ return;
case SMB_DIRECT_MSG_DATA_TRANSFER: {
struct smb_direct_data_transfer *data_transfer =
(struct smb_direct_data_transfer *)recvmsg->packet;
@@ -578,7 +553,8 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
if (wc->byte_len <
offsetof(struct smb_direct_data_transfer, padding)) {
- put_empty_recvmsg(t, recvmsg);
+ put_recvmsg(t, recvmsg);
+ smb_direct_disconnect_rdma_connection(t);
return;
}
@@ -586,7 +562,8 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
if (data_length) {
if (wc->byte_len < sizeof(struct smb_direct_data_transfer) +
(u64)data_length) {
- put_empty_recvmsg(t, recvmsg);
+ put_recvmsg(t, recvmsg);
+ smb_direct_disconnect_rdma_connection(t);
return;
}
@@ -598,16 +575,11 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
else
t->full_packet_received = true;
- enqueue_reassembly(t, recvmsg, (int)data_length);
- wake_up_interruptible(&t->wait_reassembly_queue);
-
spin_lock(&t->receive_credit_lock);
receive_credits = --(t->recv_credits);
avail_recvmsg_count = t->count_avail_recvmsg;
spin_unlock(&t->receive_credit_lock);
} else {
- put_empty_recvmsg(t, recvmsg);
-
spin_lock(&t->receive_credit_lock);
receive_credits = --(t->recv_credits);
avail_recvmsg_count = ++(t->count_avail_recvmsg);
@@ -629,11 +601,23 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
if (is_receive_credit_post_required(receive_credits, avail_recvmsg_count))
mod_delayed_work(smb_direct_wq,
&t->post_recv_credits_work, 0);
- break;
+
+ if (data_length) {
+ enqueue_reassembly(t, recvmsg, (int)data_length);
+ wake_up_interruptible(&t->wait_reassembly_queue);
+ } else
+ put_recvmsg(t, recvmsg);
+
+ return;
}
- default:
- break;
}
+
+ /*
+ * This is an internal error!
+ */
+ WARN_ON_ONCE(recvmsg->type != SMB_DIRECT_MSG_DATA_TRANSFER);
+ put_recvmsg(t, recvmsg);
+ smb_direct_disconnect_rdma_connection(t);
}
static int smb_direct_post_recv(struct smb_direct_transport *t,
@@ -663,6 +647,7 @@ static int smb_direct_post_recv(struct smb_direct_transport *t,
ib_dma_unmap_single(t->cm_id->device,
recvmsg->sge.addr, recvmsg->sge.length,
DMA_FROM_DEVICE);
+ recvmsg->sge.length = 0;
smb_direct_disconnect_rdma_connection(t);
return ret;
}
@@ -804,7 +789,6 @@ static void smb_direct_post_recv_credits(struct work_struct *work)
struct smb_direct_recvmsg *recvmsg;
int receive_credits, credits = 0;
int ret;
- int use_free = 1;
spin_lock(&t->receive_credit_lock);
receive_credits = t->recv_credits;
@@ -812,18 +796,9 @@ static void smb_direct_post_recv_credits(struct work_struct *work)
if (receive_credits < t->recv_credit_target) {
while (true) {
- if (use_free)
- recvmsg = get_free_recvmsg(t);
- else
- recvmsg = get_empty_recvmsg(t);
- if (!recvmsg) {
- if (use_free) {
- use_free = 0;
- continue;
- } else {
- break;
- }
- }
+ recvmsg = get_free_recvmsg(t);
+ if (!recvmsg)
+ break;
recvmsg->type = SMB_DIRECT_MSG_DATA_TRANSFER;
recvmsg->first_segment = false;
@@ -1799,8 +1774,6 @@ static void smb_direct_destroy_pools(struct smb_direct_transport *t)
while ((recvmsg = get_free_recvmsg(t)))
mempool_free(recvmsg, t->recvmsg_mempool);
- while ((recvmsg = get_empty_recvmsg(t)))
- mempool_free(recvmsg, t->recvmsg_mempool);
mempool_destroy(t->recvmsg_mempool);
t->recvmsg_mempool = NULL;
@@ -1856,6 +1829,7 @@ static int smb_direct_create_pools(struct smb_direct_transport *t)
if (!recvmsg)
goto err;
recvmsg->transport = t;
+ recvmsg->sge.length = 0;
list_add(&recvmsg->list, &t->recvmsg_queue);
}
t->count_avail_recvmsg = t->recv_credit_max;
@@ -1934,8 +1908,8 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t,
return 0;
err:
if (t->qp) {
- ib_destroy_qp(t->qp);
t->qp = NULL;
+ rdma_destroy_qp(t->cm_id);
}
if (t->recv_cq) {
ib_destroy_cq(t->recv_cq);
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index 25f7c86ba9b9..1222cf6be5ef 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -87,6 +87,7 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk)
return NULL;
}
+ conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr;
conn->transport = KSMBD_TRANS(t);
KSMBD_TRANS(t)->conn = conn;
KSMBD_TRANS(t)->ops = &ksmbd_tcp_transport_ops;
@@ -226,6 +227,8 @@ static int ksmbd_kthread_fn(void *p)
{
struct socket *client_sk = NULL;
struct interface *iface = (struct interface *)p;
+ struct inet_sock *csk_inet;
+ struct ksmbd_conn *conn;
int ret;
while (!kthread_should_stop()) {
@@ -244,6 +247,20 @@ static int ksmbd_kthread_fn(void *p)
continue;
}
+ /*
+ * Limit repeated connections from clients with the same IP address.
+ */
+ csk_inet = inet_sk(client_sk->sk);
+ down_read(&conn_list_lock);
+ list_for_each_entry(conn, &conn_list, conns_list)
+ if (csk_inet->inet_daddr == conn->inet_addr) {
+ ret = -EAGAIN;
+ break;
+ }
+ up_read(&conn_list_lock);
+ if (ret == -EAGAIN)
+ continue;
+
if (server_conf.max_connections &&
atomic_inc_return(&active_num_conn) >= server_conf.max_connections) {
pr_info_ratelimited("Limit the maximum number of connections(%u)\n",
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 396d4ea77d34..871c0d8e5012 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -423,10 +423,15 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos,
ksmbd_debug(VFS, "write stream data pos : %llu, count : %zd\n",
*pos, count);
+ if (*pos >= XATTR_SIZE_MAX) {
+ pr_err("stream write position %lld is out of bounds\n", *pos);
+ return -EINVAL;
+ }
+
size = *pos + count;
if (size > XATTR_SIZE_MAX) {
size = XATTR_SIZE_MAX;
- count = (*pos + count) - XATTR_SIZE_MAX;
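+ /* clamp count so the stream data ends exactly at XATTR_SIZE_MAX */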
+ count = XATTR_SIZE_MAX - *pos;
}
v_len = ksmbd_vfs_getcasexattr(user_ns,
@@ -493,7 +498,8 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
int err = 0;
if (work->conn->connection_type) {
- if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE))) {
+ if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE)) ||
+ S_ISDIR(file_inode(fp->filp)->i_mode)) {
pr_err("no right to write(%pD)\n", fp->filp);
err = -EACCES;
goto out;
@@ -556,7 +562,8 @@ int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat)
{
int err;
- err = vfs_getattr(path, stat, STATX_BTIME, AT_STATX_SYNC_AS_STAT);
+ err = vfs_getattr(path, stat, STATX_BASIC_STATS | STATX_BTIME,
+ AT_STATX_SYNC_AS_STAT);
if (err)
pr_err("getattr failed, err %d\n", err);
return err;
@@ -1277,6 +1284,7 @@ out1:
err = ksmbd_vfs_lock_parent(parent_path->dentry, path->dentry);
if (err) {
+ mnt_drop_write(parent_path->mnt);
path_put(path);
path_put(parent_path);
}
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 32565dafa7f3..37579c07f6fd 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -137,6 +137,11 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
msblk->panic_on_errors = (opts->errors == Opt_errors_panic);
msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
+ if (!msblk->devblksize) {
+ errorf(fc, "squashfs: unable to set blocksize\n");
+ return -EINVAL;
+ }
+
msblk->devblksize_log2 = ffz(~msblk->devblksize);
mutex_init(&msblk->meta_index_mutex);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 82101a2cf933..fcf96f52b2e9 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1426,6 +1426,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
if (prev != vma)
mas_next(&mas, ULONG_MAX);
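+ /* if the first vma starts before the requested range, it is also the predecessor */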
+ if (vma->vm_start < start)
+ prev = vma;
+
ret = 0;
do {
cond_resched();
@@ -1603,6 +1606,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
if (prev != vma)
mas_next(&mas, ULONG_MAX);
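+ /* as above: a vma straddling the range start is its own predecessor */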
+ if (vma->vm_start < start)
+ prev = vma;
+
ret = 0;
do {
cond_resched();
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index 74952e58cca0..48f33d4994dc 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -21,7 +21,8 @@
#define VBOXSF_SUPER_MAGIC 0x786f4256 /* 'VBox' little endian */
-static const unsigned char VBSF_MOUNT_SIGNATURE[4] = "\000\377\376\375";
+static const unsigned char VBSF_MOUNT_SIGNATURE[4] = { '\000', '\377', '\376',
+ '\375' };
static int follow_symlinks;
module_param(follow_symlinks, int, 0444);
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 9fac5ea8d0e4..dff90db507e3 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -154,6 +154,18 @@ config XFS_DEBUG
Say N unless you are an XFS developer, or you play one on TV.
+config XFS_DEBUG_EXPENSIVE
+ bool "XFS expensive debugging checks"
+ depends on XFS_FS && XFS_DEBUG
+ help
+ Say Y here to get an XFS build with expensive debugging checks
+ enabled. These checks may affect performance significantly.
+
+ Note that the resulting code will be HUGER and SLOWER, and probably
+ not useful unless you are debugging a particular problem.
+
+ Say N unless you are an XFS developer, or you play one on TV.
+
config XFS_ASSERT_FATAL
bool "XFS fatal asserts"
default y
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index bf47efe08a58..9743fa5b5388 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -259,6 +259,30 @@ xfs_agino_range(
return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last);
}
+/*
+ * Free the perag structures within the specified AG range. This is only
+ * used to free unused perags on error handling paths.
+ */
+void
+xfs_free_unused_perag_range(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agstart,
+ xfs_agnumber_t agend)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t index;
+
+ for (index = agstart; index < agend; index++) {
+ spin_lock(&mp->m_perag_lock);
+ pag = radix_tree_delete(&mp->m_perag_tree, index);
+ spin_unlock(&mp->m_perag_lock);
+ if (!pag)
+ break;
+ xfs_buf_hash_destroy(pag);
+ kmem_free(pag);
+ }
+}
+
int
xfs_initialize_perag(
struct xfs_mount *mp,
@@ -345,18 +369,14 @@ xfs_initialize_perag(
return 0;
out_remove_pag:
+ spin_lock(&mp->m_perag_lock);
radix_tree_delete(&mp->m_perag_tree, index);
+ spin_unlock(&mp->m_perag_lock);
out_free_pag:
kmem_free(pag);
out_unwind_new_pags:
/* unwind any prior newly initialized pags */
- for (index = first_initialised; index < agcount; index++) {
- pag = radix_tree_delete(&mp->m_perag_tree, index);
- if (!pag)
- break;
- xfs_buf_hash_destroy(pag);
- kmem_free(pag);
- }
+ xfs_free_unused_perag_range(mp, first_initialised, agcount);
return error;
}
@@ -906,7 +926,10 @@ xfs_ag_shrink_space(
if (err2 != -ENOSPC)
goto resv_err;
- __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, true);
+ err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL,
+ XFS_AG_RESV_NONE, true);
+ if (err2)
+ goto resv_err;
/*
* Roll the transaction before trying to re-init the per-ag
@@ -981,10 +1004,8 @@ xfs_ag_extend_space(
if (error)
return error;
- error = xfs_free_extent(tp, XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno,
- be32_to_cpu(agf->agf_length) - len),
- len, &XFS_RMAP_OINFO_SKIP_UPDATE,
- XFS_AG_RESV_NONE);
+ error = xfs_free_extent(tp, pag, be32_to_cpu(agf->agf_length) - len,
+ len, &XFS_RMAP_OINFO_SKIP_UPDATE, XFS_AG_RESV_NONE);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 191b22b9a35b..eb84af1c8628 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -106,6 +106,9 @@ struct xfs_perag {
#endif /* __KERNEL__ */
};
+
+void xfs_free_unused_perag_range(struct xfs_mount *mp, xfs_agnumber_t agstart,
+ xfs_agnumber_t agend);
int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount,
xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi);
int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 74d039bdc9f7..cd5b197d7046 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2485,45 +2485,53 @@ xfs_agfl_reset(
* the real allocation can proceed. Deferring the free disconnects freeing up
* the AGFL slot from freeing the block.
*/
-STATIC void
+static int
xfs_defer_agfl_block(
struct xfs_trans *tp,
xfs_agnumber_t agno,
- xfs_fsblock_t agbno,
+ xfs_agblock_t agbno,
struct xfs_owner_info *oinfo)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_extent_free_item *new; /* new element */
+ struct xfs_extent_free_item *xefi;
+ xfs_fsblock_t fsbno = XFS_AGB_TO_FSB(mp, agno, agbno);
ASSERT(xfs_extfree_item_cache != NULL);
ASSERT(oinfo != NULL);
- new = kmem_cache_zalloc(xfs_extfree_item_cache,
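+ /* validate the block number before queueing the deferred AGFL free */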
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, fsbno)))
+ return -EFSCORRUPTED;
+
+ xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
GFP_KERNEL | __GFP_NOFAIL);
- new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
- new->xefi_blockcount = 1;
- new->xefi_owner = oinfo->oi_owner;
+ xefi->xefi_startblock = fsbno;
+ xefi->xefi_blockcount = 1;
+ xefi->xefi_owner = oinfo->oi_owner;
+ xefi->xefi_agresv = XFS_AG_RESV_AGFL;
trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
+ xfs_extent_free_get_group(mp, xefi);
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list);
+ return 0;
}
/*
* Add the extent to the list of extents to be free at transaction end.
* The list is maintained sorted (by block number).
*/
-void
+int
__xfs_free_extent_later(
struct xfs_trans *tp,
xfs_fsblock_t bno,
xfs_filblks_t len,
const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type,
bool skip_discard)
{
- struct xfs_extent_free_item *new; /* new element */
-#ifdef DEBUG
+ struct xfs_extent_free_item *xefi;
struct xfs_mount *mp = tp->t_mountp;
+#ifdef DEBUG
xfs_agnumber_t agno;
xfs_agblock_t agbno;
@@ -2539,28 +2547,36 @@ __xfs_free_extent_later(
ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
#endif
ASSERT(xfs_extfree_item_cache != NULL);
+ ASSERT(type != XFS_AG_RESV_AGFL);
- new = kmem_cache_zalloc(xfs_extfree_item_cache,
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
+ return -EFSCORRUPTED;
+
+ xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
GFP_KERNEL | __GFP_NOFAIL);
- new->xefi_startblock = bno;
- new->xefi_blockcount = (xfs_extlen_t)len;
+ xefi->xefi_startblock = bno;
+ xefi->xefi_blockcount = (xfs_extlen_t)len;
+ xefi->xefi_agresv = type;
if (skip_discard)
- new->xefi_flags |= XFS_EFI_SKIP_DISCARD;
+ xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
if (oinfo) {
ASSERT(oinfo->oi_offset == 0);
if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
- new->xefi_flags |= XFS_EFI_ATTR_FORK;
+ xefi->xefi_flags |= XFS_EFI_ATTR_FORK;
if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
- new->xefi_flags |= XFS_EFI_BMBT_BLOCK;
- new->xefi_owner = oinfo->oi_owner;
+ xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK;
+ xefi->xefi_owner = oinfo->oi_owner;
} else {
- new->xefi_owner = XFS_RMAP_OWN_NULL;
+ xefi->xefi_owner = XFS_RMAP_OWN_NULL;
}
- trace_xfs_bmap_free_defer(tp->t_mountp,
+ trace_xfs_bmap_free_defer(mp,
XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
+
+ xfs_extent_free_get_group(mp, xefi);
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
+ return 0;
}
#ifdef DEBUG
@@ -2720,7 +2736,9 @@ xfs_alloc_fix_freelist(
goto out_agbp_relse;
/* defer agfl frees */
- xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
+ error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
+ if (error)
+ goto out_agbp_relse;
}
targs.tp = tp;
@@ -3447,7 +3465,8 @@ xfs_free_extent_fix_freelist(
int
__xfs_free_extent(
struct xfs_trans *tp,
- xfs_fsblock_t bno,
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno,
xfs_extlen_t len,
const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type,
@@ -3455,12 +3474,9 @@ __xfs_free_extent(
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_buf *agbp;
- xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
- xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
struct xfs_agf *agf;
int error;
unsigned int busy_flags = 0;
- struct xfs_perag *pag;
ASSERT(len != 0);
ASSERT(type != XFS_AG_RESV_AGFL);
@@ -3469,10 +3485,9 @@ __xfs_free_extent(
XFS_ERRTAG_FREE_EXTENT))
return -EIO;
- pag = xfs_perag_get(mp, agno);
error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
if (error)
- goto err;
+ return error;
agf = agbp->b_addr;
if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
@@ -3486,20 +3501,18 @@ __xfs_free_extent(
goto err_release;
}
- error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
+ error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo,
+ type);
if (error)
goto err_release;
if (skip_discard)
busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
- xfs_perag_put(pag);
return 0;
err_release:
xfs_trans_brelse(tp, agbp);
-err:
- xfs_perag_put(pag);
return error;
}
@@ -3532,15 +3545,11 @@ xfs_alloc_query_range(
xfs_alloc_query_range_fn fn,
void *priv)
{
- union xfs_btree_irec low_brec;
- union xfs_btree_irec high_brec;
- struct xfs_alloc_query_range_info query;
+ union xfs_btree_irec low_brec = { .a = *low_rec };
+ union xfs_btree_irec high_brec = { .a = *high_rec };
+ struct xfs_alloc_query_range_info query = { .priv = priv, .fn = fn };
ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
- low_brec.a = *low_rec;
- high_brec.a = *high_rec;
- query.priv = priv;
- query.fn = fn;
return xfs_btree_query_range(cur, &low_brec, &high_brec,
xfs_alloc_query_range_helper, &query);
}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 2c3f762dfb58..2dd93d62150f 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -130,7 +130,8 @@ xfs_alloc_vextent(
int /* error */
__xfs_free_extent(
struct xfs_trans *tp, /* transaction pointer */
- xfs_fsblock_t bno, /* starting block number of extent */
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno,
xfs_extlen_t len, /* length of extent */
const struct xfs_owner_info *oinfo, /* extent owner */
enum xfs_ag_resv_type type, /* block reservation type */
@@ -139,12 +140,13 @@ __xfs_free_extent(
static inline int
xfs_free_extent(
struct xfs_trans *tp,
- xfs_fsblock_t bno,
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno,
xfs_extlen_t len,
const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type)
{
- return __xfs_free_extent(tp, bno, len, oinfo, type, false);
+ return __xfs_free_extent(tp, pag, agbno, len, oinfo, type, false);
}
int /* error */
@@ -211,9 +213,9 @@ xfs_buf_to_agfl_bno(
return bp->b_addr;
}
-void __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
+int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
xfs_filblks_t len, const struct xfs_owner_info *oinfo,
- bool skip_discard);
+ enum xfs_ag_resv_type type, bool skip_discard);
/*
* List of extents to be free "later".
@@ -224,21 +226,27 @@ struct xfs_extent_free_item {
uint64_t xefi_owner;
xfs_fsblock_t xefi_startblock;/* starting fs block number */
xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
+ struct xfs_perag *xefi_pag;
unsigned int xefi_flags;
+ enum xfs_ag_resv_type xefi_agresv;
};
+void xfs_extent_free_get_group(struct xfs_mount *mp,
+ struct xfs_extent_free_item *xefi);
+
#define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */
#define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */
#define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */
-static inline void
+static inline int
xfs_free_extent_later(
struct xfs_trans *tp,
xfs_fsblock_t bno,
xfs_filblks_t len,
- const struct xfs_owner_info *oinfo)
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type)
{
- __xfs_free_extent_later(tp, bno, len, oinfo, false);
+ return __xfs_free_extent_later(tp, bno, len, oinfo, type, false);
}
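Every converted call site now follows the same shape: pass a reservation type and fail the operation if the extent cannot be queued. A minimal sketch of the updated contract, with tp, bno, len and oinfo as in the surrounding hunks:

	/*
	 * Deferred frees can now fail fast on a corrupt extent instead of
	 * queueing garbage; XFS_AG_RESV_NONE is the common "no reservation"
	 * case used by most callers converted in this patch.
	 */
	error = xfs_free_extent_later(tp, bno, len, oinfo, XFS_AG_RESV_NONE);
	if (error)
		return error;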
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index e28d93d232de..32d350e97e0f 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -421,10 +421,10 @@ xfs_attr_complete_op(
bool do_replace = args->op_flags & XFS_DA_OP_REPLACE;
args->op_flags &= ~XFS_DA_OP_REPLACE;
- if (do_replace) {
- args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ if (do_replace)
return replace_state;
- }
+
return XFS_DAS_DONE;
}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d440393b40eb..54de405cbab5 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -619,7 +619,6 @@ xfs_attr_rmtval_set_blk(
if (error)
return error;
- ASSERT(nmap == 1);
ASSERT((map->br_startblock != DELAYSTARTBLOCK) &&
(map->br_startblock != HOLESTARTBLOCK));
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 27d3121e6da9..14b0d230f61b 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -21,7 +21,7 @@
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
@@ -572,8 +572,13 @@ xfs_bmap_btree_to_extents(
cblock = XFS_BUF_TO_BLOCK(cbp);
if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
return error;
+
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
- xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo);
+ error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo,
+ XFS_AG_RESV_NONE);
+ if (error)
+ return error;
+
ip->i_nblocks--;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
xfs_trans_binval(tp, cbp);
@@ -1525,6 +1530,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
}
+ ASSERT(da_new <= da_old);
break;
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -1554,6 +1560,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
}
+ ASSERT(da_new <= da_old);
break;
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1587,6 +1594,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
}
+ ASSERT(da_new <= da_old);
break;
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -1619,6 +1627,7 @@ xfs_bmap_add_extent_delay_real(
goto done;
}
}
+ ASSERT(da_new <= da_old);
break;
case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -1656,6 +1665,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
}
+ ASSERT(da_new <= da_old);
break;
case BMAP_LEFT_FILLING:
@@ -1743,6 +1753,7 @@ xfs_bmap_add_extent_delay_real(
xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
xfs_iext_next(ifp, &bma->icur);
xfs_iext_update_extent(bma->ip, state, &bma->icur, &RIGHT);
+ ASSERT(da_new <= da_old);
break;
case BMAP_RIGHT_FILLING:
@@ -1790,6 +1801,7 @@ xfs_bmap_add_extent_delay_real(
PREV.br_blockcount = temp;
xfs_iext_insert(bma->ip, &bma->icur, &PREV, state);
xfs_iext_next(ifp, &bma->icur);
+ ASSERT(da_new <= da_old);
break;
case 0:
@@ -1910,11 +1922,9 @@ xfs_bmap_add_extent_delay_real(
}
/* adjust for changes in reserved delayed indirect blocks */
- if (da_new != da_old) {
- ASSERT(state == 0 || da_new < da_old);
+ if (da_new != da_old)
error = xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new),
- false);
- }
+ true);
xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
done:
@@ -3949,20 +3959,32 @@ xfs_bmapi_reserve_delalloc(
xfs_extlen_t alen;
xfs_extlen_t indlen;
int error;
- xfs_fileoff_t aoff = off;
+ xfs_fileoff_t aoff;
+ bool use_cowextszhint =
+ whichfork == XFS_COW_FORK && !prealloc;
+retry:
/*
* Cap the alloc length. Keep track of prealloc so we know whether to
* tag the inode before we return.
*/
+ aoff = off;
alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
if (!eof)
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
if (prealloc && alen >= len)
prealloc = alen - len;
- /* Figure out the extent size, adjust alen */
- if (whichfork == XFS_COW_FORK) {
+ /*
+ * If we're targeting the COW fork but aren't creating a speculative
+ * posteof preallocation, try to expand the reservation to align with
+ * the COW extent size hint if there's sufficient free space.
+ *
+ * Unlike the data fork, the CoW cancellation functions will free all
+ * the reservations at inactivation, so we don't require that every
+ * delalloc reservation have a dirty pagecache.
+ */
+ if (use_cowextszhint) {
struct xfs_bmbt_irec prev;
xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
@@ -3981,7 +4003,7 @@ xfs_bmapi_reserve_delalloc(
*/
error = xfs_quota_reserve_blkres(ip, alen);
if (error)
- return error;
+ goto out;
/*
* Split changing sb for alen and indlen since they could be coming
@@ -4026,6 +4048,17 @@ out_unreserve_blocks:
out_unreserve_quota:
if (XFS_IS_QUOTA_ON(mp))
xfs_quota_unreserve_blkres(ip, alen);
+out:
+ if (error == -ENOSPC || error == -EDQUOT) {
+ trace_xfs_delalloc_enospc(ip, off, len);
+
+ if (prealloc || use_cowextszhint) {
+ /* retry without any preallocation */
+ use_cowextszhint = false;
+ prealloc = 0;
+ goto retry;
+ }
+ }
return error;
}
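The retry policy added above reduces to: on the first -ENOSPC or -EDQUOT, drop both the speculative preallocation and the COW extent size hint, then try once more at the minimal size. A self-contained sketch, where reserve() stands in for the quota and free-block reservation steps:

	#include <errno.h>

	/* Returns 0, -ENOSPC or -EDQUOT; stands in for the reservations. */
	extern int reserve(unsigned int len, unsigned int prealloc,
			   int use_hint);

	static int reserve_delalloc(unsigned int len, unsigned int prealloc,
				    int use_hint)
	{
		int error;

	retry:
		error = reserve(len, prealloc, use_hint);
		if (error == -ENOSPC || error == -EDQUOT) {
			if (prealloc || use_hint) {
				/* retry without any preallocation */
				use_hint = 0;
				prealloc = 0;
				goto retry;
			}
		}
		return error;
	}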
@@ -4108,8 +4141,10 @@ xfs_bmapi_allocate(
} else {
error = xfs_bmap_alloc_userdata(bma);
}
- if (error || bma->blkno == NULLFSBLOCK)
+ if (error)
return error;
+ if (bma->blkno == NULLFSBLOCK)
+ return -ENOSPC;
if (bma->flags & XFS_BMAPI_ZERO) {
error = xfs_zero_extent(bma->ip, bma->blkno, bma->length);
@@ -4289,6 +4324,15 @@ xfs_bmapi_finish(
* extent state if necessary. Detailed behaviour is controlled by the flags
* parameter. Only allocates blocks from a single allocation group, to avoid
* locking problems.
+ *
+ * Returns 0 on success and places the extent mappings in mval. nmap is used
+ * as an input/output parameter where the caller specifies the maximum number
+ * of mappings that may be returned and xfs_bmapi_write passes back the number
+ * of mappings (including existing mappings) it found.
+ *
+ * Returns a negative error code on failure, including -ENOSPC when it could not
+ * allocate any blocks and -ENOSR when it did allocate blocks to convert a
+ * delalloc range, but those blocks were before the passed-in range.
*/
int
xfs_bmapi_write(
@@ -4416,10 +4460,16 @@ xfs_bmapi_write(
ASSERT(len > 0);
ASSERT(bma.length > 0);
error = xfs_bmapi_allocate(&bma);
- if (error)
+ if (error) {
+ /*
+ * If we already allocated space in a previous
+ * iteration, return what we got so far when
+ * running out of space.
+ */
+ if (error == -ENOSPC && bma.nallocs)
+ break;
goto error0;
- if (bma.blkno == NULLFSBLOCK)
- break;
+ }
/*
* If this is a CoW allocation, record the data in
@@ -4457,7 +4507,6 @@ xfs_bmapi_write(
if (!xfs_iext_next_extent(ifp, &bma.icur, &bma.got))
eof = true;
}
- *nmap = n;
error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
whichfork);
@@ -4468,7 +4517,22 @@ xfs_bmapi_write(
ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork));
xfs_bmapi_finish(&bma, whichfork, 0);
xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
- orig_nmap, *nmap);
+ orig_nmap, n);
+
+ /*
+ * When converting delayed allocations, xfs_bmapi_allocate ignores
+ * the passed in bno and always converts from the start of the found
+ * delalloc extent.
+ *
+ * To avoid a successful return with *nmap set to 0, return the magic
+ * -ENOSR error code for this particular case so that the caller can
+ * handle it.
+ */
+ if (!n) {
+ ASSERT(bma.nallocs >= *nmap);
+ return -ENOSR;
+ }
+ *nmap = n;
return 0;
error0:
xfs_bmapi_finish(&bma, whichfork, error);
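Given the return semantics documented above, a caller has three outcomes to dispatch on. A hypothetical sketch (offset_fsb, count_fsb, total and the relookup label are placeholders, not code from this patch):

	struct xfs_bmbt_irec	map;
	int			nmap = 1;
	int			error;

	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, flags,
			total, &map, &nmap);
	switch (error) {
	case 0:
		break;			/* nmap mappings returned in map */
	case -ENOSPC:
		return error;		/* nothing could be allocated */
	case -ENOSR:
		goto relookup;		/* converted blocks fell before the
					 * requested range; look up again */
	default:
		return error;
	}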
@@ -4481,8 +4545,8 @@ error0:
* invocations to allocate the target offset if a large enough physical extent
* is not available.
*/
-int
-xfs_bmapi_convert_delalloc(
+static int
+xfs_bmapi_convert_one_delalloc(
struct xfs_inode *ip,
int whichfork,
xfs_off_t offset,
@@ -4539,7 +4603,8 @@ xfs_bmapi_convert_delalloc(
if (!isnullstartblock(bma.got.br_startblock)) {
xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
xfs_iomap_inode_sequence(ip, flags));
- *seq = READ_ONCE(ifp->if_seq);
+ if (seq)
+ *seq = READ_ONCE(ifp->if_seq);
goto out_trans_cancel;
}
@@ -4575,9 +4640,6 @@ xfs_bmapi_convert_delalloc(
if (error)
goto out_finish;
- error = -ENOSPC;
- if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
- goto out_finish;
error = -EFSCORRUPTED;
if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock)))
goto out_finish;
@@ -4588,7 +4650,8 @@ xfs_bmapi_convert_delalloc(
ASSERT(!isnullstartblock(bma.got.br_startblock));
xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
xfs_iomap_inode_sequence(ip, flags));
- *seq = READ_ONCE(ifp->if_seq);
+ if (seq)
+ *seq = READ_ONCE(ifp->if_seq);
if (whichfork == XFS_COW_FORK)
xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
@@ -4611,6 +4674,36 @@ out_trans_cancel:
return error;
}
+/*
+ * Pass in a delalloc extent and convert it to real extents, returning the real
+ * extent that maps offset_fsb in iomap.
+ */
+int
+xfs_bmapi_convert_delalloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ loff_t offset,
+ struct iomap *iomap,
+ unsigned int *seq)
+{
+ int error;
+
+ /*
+ * Attempt to allocate whatever delalloc extent currently backs offset
+ * and put the result into iomap. Allocate in a loop because it may
+ * take several attempts to allocate real blocks for a contiguous
+ * delalloc extent if free space is sufficiently fragmented.
+ */
+ do {
+ error = xfs_bmapi_convert_one_delalloc(ip, whichfork, offset,
+ iomap, seq);
+ if (error)
+ return error;
+ } while (iomap->offset + iomap->length <= offset);
+
+ return 0;
+}
+
int
xfs_bmapi_remap(
struct xfs_trans *tp,
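The wrapper above also makes the fork sequence pointer optional, since *seq is now only written when non-NULL. Both call shapes, sketched:

	/* writeback path: capture the fork sequence for later revalidation */
	error = xfs_bmapi_convert_delalloc(ip, whichfork, offset, &iomap, &seq);

	/* callers that never revalidate may now pass NULL */
	error = xfs_bmapi_convert_delalloc(ip, whichfork, offset, &iomap, NULL);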
@@ -4994,7 +5087,6 @@ xfs_bmap_del_extent_real(
xfs_fileoff_t del_endoff; /* first offset past del */
int do_fx; /* free extent at end of routine */
int error; /* error return value */
- int flags = 0;/* inode logging flags */
struct xfs_bmbt_irec got; /* current extent entry */
xfs_fileoff_t got_endoff; /* first offset past got */
int i; /* temp state */
@@ -5007,6 +5099,8 @@ xfs_bmap_del_extent_real(
uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_bmbt_irec old;
+ *logflagsp = 0;
+
mp = ip->i_mount;
XFS_STATS_INC(mp, xs_del_exlist);
@@ -5019,7 +5113,6 @@ xfs_bmap_del_extent_real(
ASSERT(got_endoff >= del_endoff);
ASSERT(!isnullstartblock(got.br_startblock));
qfield = 0;
- error = 0;
/*
* If it's the case where the directory code is running with no block
@@ -5035,13 +5128,13 @@ xfs_bmap_del_extent_real(
del->br_startoff > got.br_startoff && del_endoff < got_endoff)
return -ENOSPC;
- flags = XFS_ILOG_CORE;
+ *logflagsp = XFS_ILOG_CORE;
if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
if (!(bflags & XFS_BMAPI_REMAP)) {
error = xfs_rtfree_blocks(tp, del->br_startblock,
del->br_blockcount);
if (error)
- goto done;
+ return error;
}
do_fx = 0;
@@ -5056,11 +5149,9 @@ xfs_bmap_del_extent_real(
if (cur) {
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
- goto done;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
- }
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
}
if (got.br_startoff == del->br_startoff)
@@ -5077,17 +5168,15 @@ xfs_bmap_del_extent_real(
xfs_iext_prev(ifp, icur);
ifp->if_nextents--;
- flags |= XFS_ILOG_CORE;
+ *logflagsp |= XFS_ILOG_CORE;
if (!cur) {
- flags |= xfs_ilog_fext(whichfork);
+ *logflagsp |= xfs_ilog_fext(whichfork);
break;
}
if ((error = xfs_btree_delete(cur, &i)))
- goto done;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
- }
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
break;
case BMAP_LEFT_FILLING:
/*
@@ -5098,12 +5187,12 @@ xfs_bmap_del_extent_real(
got.br_blockcount -= del->br_blockcount;
xfs_iext_update_extent(ip, state, icur, &got);
if (!cur) {
- flags |= xfs_ilog_fext(whichfork);
+ *logflagsp |= xfs_ilog_fext(whichfork);
break;
}
error = xfs_bmbt_update(cur, &got);
if (error)
- goto done;
+ return error;
break;
case BMAP_RIGHT_FILLING:
/*
@@ -5112,12 +5201,12 @@ xfs_bmap_del_extent_real(
got.br_blockcount -= del->br_blockcount;
xfs_iext_update_extent(ip, state, icur, &got);
if (!cur) {
- flags |= xfs_ilog_fext(whichfork);
+ *logflagsp |= xfs_ilog_fext(whichfork);
break;
}
error = xfs_bmbt_update(cur, &got);
if (error)
- goto done;
+ return error;
break;
case 0:
/*
@@ -5134,18 +5223,18 @@ xfs_bmap_del_extent_real(
new.br_state = got.br_state;
new.br_startblock = del_endblock;
- flags |= XFS_ILOG_CORE;
+ *logflagsp |= XFS_ILOG_CORE;
if (cur) {
error = xfs_bmbt_update(cur, &got);
if (error)
- goto done;
+ return error;
error = xfs_btree_increment(cur, 0, &i);
if (error)
- goto done;
+ return error;
cur->bc_rec.b = new;
error = xfs_btree_insert(cur, &i);
if (error && error != -ENOSPC)
- goto done;
+ return error;
/*
* If we get no space back from the btree insert, it tried a
* split, and we have a zero block reservation. Fix up
@@ -5158,33 +5247,28 @@ xfs_bmap_del_extent_real(
*/
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
- goto done;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
- }
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
/*
* Update the btree record back
* to the original value.
*/
error = xfs_bmbt_update(cur, &old);
if (error)
- goto done;
+ return error;
/*
* Reset the extent record back
* to the original value.
*/
xfs_iext_update_extent(ip, state, icur, &old);
- flags = 0;
- error = -ENOSPC;
- goto done;
- }
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
+ *logflagsp = 0;
+ return -ENOSPC;
}
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
} else
- flags |= xfs_ilog_fext(whichfork);
+ *logflagsp |= xfs_ilog_fext(whichfork);
ifp->if_nextents++;
xfs_iext_next(ifp, icur);
@@ -5202,10 +5286,13 @@ xfs_bmap_del_extent_real(
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
xfs_refcount_decrease_extent(tp, del);
} else {
- __xfs_free_extent_later(tp, del->br_startblock,
+ error = __xfs_free_extent_later(tp, del->br_startblock,
del->br_blockcount, NULL,
- (bflags & XFS_BMAPI_NODISCARD) ||
- del->br_state == XFS_EXT_UNWRITTEN);
+ XFS_AG_RESV_NONE,
+ ((bflags & XFS_BMAPI_NODISCARD) ||
+ del->br_state == XFS_EXT_UNWRITTEN));
+ if (error)
+ return error;
}
}
@@ -5220,9 +5307,7 @@ xfs_bmap_del_extent_real(
if (qfield && !(bflags & XFS_BMAPI_REMAP))
xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
-done:
- *logflagsp = flags;
- return error;
+ return 0;
}
/*
@@ -6119,39 +6204,37 @@ xfs_bmap_unmap_extent(
int
xfs_bmap_finish_one(
struct xfs_trans *tp,
- struct xfs_inode *ip,
- enum xfs_bmap_intent_type type,
- int whichfork,
- xfs_fileoff_t startoff,
- xfs_fsblock_t startblock,
- xfs_filblks_t *blockcount,
- xfs_exntst_t state)
+ struct xfs_bmap_intent *bi)
{
+ struct xfs_bmbt_irec *bmap = &bi->bi_bmap;
int error = 0;
ASSERT(tp->t_firstblock == NULLFSBLOCK);
trace_xfs_bmap_deferred(tp->t_mountp,
- XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
- XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
- ip->i_ino, whichfork, startoff, *blockcount, state);
+ XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock),
+ bi->bi_type,
+ XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock),
+ bi->bi_owner->i_ino, bi->bi_whichfork,
+ bmap->br_startoff, bmap->br_blockcount,
+ bmap->br_state);
- if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK))
+ if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK))
return -EFSCORRUPTED;
if (XFS_TEST_ERROR(false, tp->t_mountp,
XFS_ERRTAG_BMAP_FINISH_ONE))
return -EIO;
- switch (type) {
+ switch (bi->bi_type) {
case XFS_BMAP_MAP:
- error = xfs_bmapi_remap(tp, ip, startoff, *blockcount,
- startblock, 0);
- *blockcount = 0;
+ error = xfs_bmapi_remap(tp, bi->bi_owner, bmap->br_startoff,
+ bmap->br_blockcount, bmap->br_startblock, 0);
+ bmap->br_blockcount = 0;
break;
case XFS_BMAP_UNMAP:
- error = __xfs_bunmapi(tp, ip, startoff, blockcount,
- XFS_BMAPI_REMAP, 1);
+ error = __xfs_bunmapi(tp, bi->bi_owner, bmap->br_startoff,
+ &bmap->br_blockcount, XFS_BMAPI_REMAP, 1);
break;
default:
ASSERT(0);
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 08c16e4edc0f..524912f276f8 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -236,10 +236,7 @@ struct xfs_bmap_intent {
struct xfs_bmbt_irec bi_bmap;
};
-int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip,
- enum xfs_bmap_intent_type type, int whichfork,
- xfs_fileoff_t startoff, xfs_fsblock_t startblock,
- xfs_filblks_t *blockcount, xfs_exntst_t state);
+int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
struct xfs_bmbt_irec *imap);
void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 18de4fbfef4e..57f401f2492d 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -285,11 +285,15 @@ xfs_bmbt_free_block(
struct xfs_trans *tp = cur->bc_tp;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
struct xfs_owner_info oinfo;
+ int error;
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
- xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo);
- ip->i_nblocks--;
+ error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo,
+ XFS_AG_RESV_NONE);
+ if (error)
+ return error;
+ ip->i_nblocks--;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
return 0;
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
index dd75e208b543..29e3f8ccb185 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.c
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -342,9 +342,7 @@ xfs_btree_bload_drop_buf(
if (*bpp == NULL)
return;
- if (!xfs_buf_delwri_queue(*bpp, buffers_list))
- ASSERT(0);
-
+ xfs_buf_delwri_queue_here(*bpp, buffers_list);
xfs_buf_relse(*bpp);
*bpp = NULL;
}
diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h
index f0d2976050ae..5f638f711246 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.h
+++ b/fs/xfs/libxfs/xfs_btree_staging.h
@@ -37,12 +37,6 @@ struct xbtree_ifakeroot {
/* Number of bytes available for this fork in the inode. */
unsigned int if_fork_size;
-
- /* Fork format. */
- unsigned int if_format;
-
- /* Number of records. */
- unsigned int if_extents;
};
/* Cursor interactions with fake roots for inode-rooted btrees. */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index e576560b46e9..12e3cca804b7 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2158,8 +2158,8 @@ xfs_da_grow_inode_int(
struct xfs_inode *dp = args->dp;
int w = args->whichfork;
xfs_rfsblock_t nblks = dp->i_nblocks;
- struct xfs_bmbt_irec map, *mapp;
- int nmap, error, got, i, mapi;
+ struct xfs_bmbt_irec map, *mapp = &map;
+ int nmap, error, got, i, mapi = 1;
/*
* Find a spot in the file space to put the new block.
@@ -2175,14 +2175,7 @@ xfs_da_grow_inode_int(
error = xfs_bmapi_write(tp, dp, *bno, count,
xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
args->total, &map, &nmap);
- if (error)
- return error;
-
- ASSERT(nmap <= 1);
- if (nmap == 1) {
- mapp = &map;
- mapi = 1;
- } else if (nmap == 0 && count > 1) {
+ if (error == -ENOSPC && count > 1) {
xfs_fileoff_t b;
int c;
@@ -2199,16 +2192,13 @@ xfs_da_grow_inode_int(
args->total, &mapp[mapi], &nmap);
if (error)
goto out_free_map;
- if (nmap < 1)
- break;
mapi += nmap;
b = mapp[mapi - 1].br_startoff +
mapp[mapi - 1].br_blockcount;
}
- } else {
- mapi = 0;
- mapp = NULL;
}
+ if (error)
+ goto out_free_map;
/*
* Count the blocks we got, make sure it matches the total.
@@ -2316,10 +2306,17 @@ xfs_da3_swap_lastblock(
return error;
/*
* Copy the last block into the dead buffer and log it.
+ * On CRC-enabled filesystems, also update the stamped-in blkno.
*/
memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
+ if (xfs_has_crc(mp)) {
+ struct xfs_da3_blkinfo *da3 = dead_buf->b_addr;
+
+ da3->blkno = cpu_to_be64(xfs_buf_daddr(dead_buf));
+ }
xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
dead_info = dead_buf->b_addr;
+
/*
* Get values from the moved block.
*/
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index dbcf58979a59..e1d5da6d8d4a 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -177,6 +177,14 @@ __xfs_dir3_data_check(
while (offset < end) {
struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
+ unsigned int reclen;
+
+ /*
+ * Are the remaining bytes large enough to hold an
+ * unused entry?
+ */
+ if (offset > end - xfs_dir2_data_unusedsize(1))
+ return __this_address;
/*
* If it's unused, look for the space in the bestfree table.
@@ -186,9 +194,13 @@ __xfs_dir3_data_check(
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
xfs_failaddr_t fa;
+ reclen = xfs_dir2_data_unusedsize(
+ be16_to_cpu(dup->length));
if (lastfree != 0)
return __this_address;
- if (offset + be16_to_cpu(dup->length) > end)
+ if (be16_to_cpu(dup->length) != reclen)
+ return __this_address;
+ if (offset + reclen > end)
return __this_address;
if (be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) !=
offset)
@@ -206,10 +218,18 @@ __xfs_dir3_data_check(
be16_to_cpu(bf[2].length))
return __this_address;
}
- offset += be16_to_cpu(dup->length);
+ offset += reclen;
lastfree = 1;
continue;
}
+
+ /*
+ * This is not an unused entry. Are the remaining bytes
+ * large enough for a dirent with a single-byte name?
+ */
+ if (offset > end - xfs_dir2_data_entsize(mp, 1))
+ return __this_address;
+
/*
* It's a real entry. Validate the fields.
* If this is a block directory then make sure it's
@@ -218,9 +238,10 @@ __xfs_dir3_data_check(
*/
if (dep->namelen == 0)
return __this_address;
- if (!xfs_verify_dir_ino(mp, be64_to_cpu(dep->inumber)))
+ reclen = xfs_dir2_data_entsize(mp, dep->namelen);
+ if (offset + reclen > end)
return __this_address;
- if (offset + xfs_dir2_data_entsize(mp, dep->namelen) > end)
+ if (!xfs_verify_dir_ino(mp, be64_to_cpu(dep->inumber)))
return __this_address;
if (be16_to_cpu(*xfs_dir2_data_entry_tag_p(mp, dep)) != offset)
return __this_address;
@@ -244,7 +265,7 @@ __xfs_dir3_data_check(
if (i >= be32_to_cpu(btp->count))
return __this_address;
}
- offset += xfs_dir2_data_entsize(mp, dep->namelen);
+ offset += reclen;
}
/*
* Need to have seen all the entries and all the bestfree slots.
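The hardened walk boils down to one rule: prove that the bytes remaining can hold the smallest possible record before reading any field of it, then prove the declared length fits before advancing. A self-contained sketch of that pattern (MIN_RECLEN is an illustrative stand-in for xfs_dir2_data_unusedsize(1)):

	#include <stddef.h>
	#include <stdint.h>

	#define MIN_RECLEN	8	/* stand-in: smallest possible record */

	/* Return the offset of the first bad record, or len if all pass. */
	static size_t walk_records(const uint8_t *buf, size_t len)
	{
		size_t offset = 0;

		while (offset < len) {
			size_t reclen;

			/* enough room left for even the smallest record? */
			if (len - offset < MIN_RECLEN)
				return offset;
			reclen = buf[offset];	/* stand-in length field */
			/* declared length must be sane and fit the buffer */
			if (reclen < MIN_RECLEN || reclen > len - offset)
				return offset;
			offset += reclen;
		}
		return len;
	}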
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 7404a9ff1a92..9046d08554e9 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -188,6 +188,13 @@ extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp,
struct dir_context *ctx, size_t bufsize);
static inline unsigned int
+xfs_dir2_data_unusedsize(
+ unsigned int len)
+{
+ return round_up(len, XFS_DIR2_DATA_ALIGN);
+}
+
+static inline unsigned int
xfs_dir2_data_entsize(
struct xfs_mount *mp,
unsigned int namelen)
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 371dc07233e0..20acb8573d7a 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -98,7 +98,7 @@ typedef struct xfs_sb {
uint32_t sb_blocksize; /* logical block size, bytes */
xfs_rfsblock_t sb_dblocks; /* number of data blocks */
xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
- xfs_rtblock_t sb_rextents; /* number of realtime extents */
+ xfs_rtbxlen_t sb_rextents; /* number of realtime extents */
uuid_t sb_uuid; /* user-visible file system unique id */
xfs_fsblock_t sb_logstart; /* starting block of log if internal */
xfs_ino_t sb_rootino; /* root inode number */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 120dbec16f5c..d1472cbd48ff 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1827,7 +1827,7 @@ xfs_dialloc(
* might be sparse and only free the regions that are allocated as part of the
* chunk.
*/
-STATIC void
+static int
xfs_difree_inode_chunk(
struct xfs_trans *tp,
xfs_agnumber_t agno,
@@ -1844,10 +1844,10 @@ xfs_difree_inode_chunk(
if (!xfs_inobt_issparse(rec->ir_holemask)) {
/* not sparse, calculate extent info directly */
- xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
- M_IGEO(mp)->ialloc_blks,
- &XFS_RMAP_OINFO_INODES);
- return;
+ return xfs_free_extent_later(tp,
+ XFS_AGB_TO_FSB(mp, agno, sagbno),
+ M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES,
+ XFS_AG_RESV_NONE);
}
/* holemask is only 16-bits (fits in an unsigned long) */
@@ -1864,6 +1864,8 @@ xfs_difree_inode_chunk(
XFS_INOBT_HOLEMASK_BITS);
nextbit = startidx + 1;
while (startidx < XFS_INOBT_HOLEMASK_BITS) {
+ int error;
+
nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
nextbit);
/*
@@ -1889,8 +1891,11 @@ xfs_difree_inode_chunk(
ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
- xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
- contigblk, &XFS_RMAP_OINFO_INODES);
+ error = xfs_free_extent_later(tp,
+ XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
+ &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE);
+ if (error)
+ return error;
/* reset range to current bit and carry on... */
startidx = endidx = nextbit;
@@ -1898,6 +1903,7 @@ xfs_difree_inode_chunk(
next:
nextbit++;
}
+ return 0;
}
STATIC int
@@ -1998,7 +2004,9 @@ xfs_difree_inobt(
goto error0;
}
- xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
+ error = xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
+ if (error)
+ goto error0;
} else {
xic->deleted = false;
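The sparse path above frees one extent per contiguous run of allocated inodes, found by scanning the 16-bit holemask; set bits mark holes that were never allocated. A user-space sketch of that scan, assuming each mask bit covers a fixed span of the chunk:

	#include <stdio.h>

	#define HOLEMASK_BITS	16

	/* Report each contiguous run of zero (allocated) bits in the mask. */
	static void walk_allocated_runs(unsigned int holemask)
	{
		int bit = 0;

		while (bit < HOLEMASK_BITS) {
			int start;

			/* skip holes: set bits were never allocated */
			while (bit < HOLEMASK_BITS && (holemask & (1U << bit)))
				bit++;
			if (bit >= HOLEMASK_BITS)
				break;
			start = bit;
			/* walk to the end of the allocated run */
			while (bit < HOLEMASK_BITS && !(holemask & (1U << bit)))
				bit++;
			printf("free extent covering bits [%d, %d)\n",
					start, bit);
		}
	}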
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 8c83e265770c..7125447cde1a 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -156,9 +156,11 @@ __xfs_inobt_free_block(
struct xfs_buf *bp,
enum xfs_ag_resv_type resv)
{
+ xfs_fsblock_t fsbno;
+
xfs_inobt_mod_blockcount(cur, -1);
- return xfs_free_extent(cur->bc_tp,
- XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)), 1,
+ fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
+ return xfs_free_extent_later(cur->bc_tp, fsbno, 1,
&XFS_RMAP_OINFO_INOBT, resv);
}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 601b05ca5fc2..3c611c8ac158 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -365,17 +365,40 @@ xfs_dinode_verify_fork(
/*
* For fork types that can contain local data, check that the fork
* format matches the size of local data contained within the fork.
- *
- * For all types, check that when the size says the should be in extent
- * or btree format, the inode isn't claiming it is in local format.
*/
if (whichfork == XFS_DATA_FORK) {
- if (S_ISDIR(mode) || S_ISLNK(mode)) {
+ /*
+ * A directory small enough to fit in the inode must be stored
+ * in local format. The directory sf <-> extents conversion
+ * code updates the directory size accordingly. Directories
+ * being truncated have zero size and are not subject to this
+ * check.
+ */
+ if (S_ISDIR(mode)) {
+ if (dip->di_size &&
+ be64_to_cpu(dip->di_size) <= fork_size &&
+ fork_format != XFS_DINODE_FMT_LOCAL)
+ return __this_address;
+ }
+
+ /*
+ * A symlink with a target small enough to fit in the inode can
+ * be stored in extents format if xattrs were added (thus
+ * converting the data fork from shortform to remote format)
+ * and then removed.
+ */
+ if (S_ISLNK(mode)) {
if (be64_to_cpu(dip->di_size) <= fork_size &&
+ fork_format != XFS_DINODE_FMT_EXTENTS &&
fork_format != XFS_DINODE_FMT_LOCAL)
return __this_address;
}
+ /*
+ * For all types, check that when the size says the fork should
+ * be in extent or btree format, the inode isn't claiming to be
+ * in local format.
+ */
if (be64_to_cpu(dip->di_size) > fork_size &&
fork_format == XFS_DINODE_FMT_LOCAL)
return __this_address;
@@ -491,9 +514,19 @@ xfs_dinode_verify(
if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN)
return __this_address;
- /* No zero-length symlinks/dirs. */
- if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0)
- return __this_address;
+ /*
+ * No zero-length symlinks/dirs unless they're unlinked and hence being
+ * inactivated.
+ */
+ if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) {
+ if (dip->di_version > 1) {
+ if (dip->di_nlink)
+ return __this_address;
+ } else {
+ if (dip->di_onlink)
+ return __this_address;
+ }
+ }
fa = xfs_dinode_verify_nrext64(mp, dip);
if (fa)
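The data fork rules above condense into a single predicate. An illustrative user-space restatement (the FMT_* enum stands in for XFS_DINODE_FMT_*; the nlink-based zero-size exemption is handled separately by the verifier):

	#include <stdbool.h>
	#include <stdint.h>

	enum fmt { FMT_LOCAL, FMT_EXTENTS, FMT_BTREE };

	static bool data_fork_format_ok(bool is_dir, bool is_symlink,
					uint64_t size, uint64_t fork_size,
					enum fmt format)
	{
		/* small non-empty directories must be shortform */
		if (is_dir && size && size <= fork_size && format != FMT_LOCAL)
			return false;
		/* small symlink targets may be local or, post-xattr, extents */
		if (is_symlink && size <= fork_size &&
		    format != FMT_LOCAL && format != FMT_EXTENTS)
			return false;
		/* nothing larger than the fork may claim local format */
		if (size > fork_size && format == FMT_LOCAL)
			return false;
		return true;
	}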
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 2420865f3007..a5100a11faf9 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -131,4 +131,26 @@ void xlog_check_buf_cancel_table(struct xlog *log);
#define xlog_check_buf_cancel_table(log) do { } while (0)
#endif
+/*
+ * Transform a regular reservation into one suitable for recovery of a log
+ * intent item.
+ *
+ * Intent recovery only runs a single step of the transaction chain and defers
+ * the rest to a separate transaction. Therefore, we reduce logcount to 1 here
+ * to avoid livelocks if the log grant space is nearly exhausted due to the
+ * recovered intent pinning the tail. Keep the same logflags to avoid tripping
+ * asserts elsewhere. Struct copies abound below.
+ */
+static inline struct xfs_trans_res
+xlog_recover_resv(const struct xfs_trans_res *r)
+{
+ struct xfs_trans_res ret = {
+ .tr_logres = r->tr_logres,
+ .tr_logcount = 1,
+ .tr_logflags = r->tr_logflags,
+ };
+
+ return ret;
+}
+
#endif /* __XFS_LOG_RECOVER_H__ */
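A sketch of the intended use during intent recovery, assuming the usual xfs_trans_alloc() path; the reservation template is copied by value, so M_RES(mp) is never modified:

	struct xfs_trans_res	resv;
	struct xfs_trans	*tp;
	int			error;

	/* single-step reservation: logcount forced to 1 */
	resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
	error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp);
	if (error)
		return error;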
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index cb035da3f990..fb05f44f6c75 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -56,7 +56,7 @@ typedef uint8_t xfs_dqtype_t;
* And, of course, we also need to take into account the dquot log format item
* used to describe each dquot.
*/
-#define XFS_DQUOT_LOGRES(mp) \
+#define XFS_DQUOT_LOGRES \
((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 6f7ed9288fe4..7e16e76fd2e1 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1129,8 +1129,11 @@ xfs_refcount_adjust_extents(
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
cur->bc_ag.pag->pag_agno,
tmp.rc_startblock);
- xfs_free_extent_later(cur->bc_tp, fsbno,
- tmp.rc_blockcount, NULL);
+ error = xfs_free_extent_later(cur->bc_tp, fsbno,
+ tmp.rc_blockcount, NULL,
+ XFS_AG_RESV_NONE);
+ if (error)
+ goto out_error;
}
(*agbno) += tmp.rc_blockcount;
@@ -1188,8 +1191,11 @@ xfs_refcount_adjust_extents(
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
cur->bc_ag.pag->pag_agno,
ext.rc_startblock);
- xfs_free_extent_later(cur->bc_tp, fsbno,
- ext.rc_blockcount, NULL);
+ error = xfs_free_extent_later(cur->bc_tp, fsbno,
+ ext.rc_blockcount, NULL,
+ XFS_AG_RESV_NONE);
+ if (error)
+ goto out_error;
}
skip:
@@ -1213,37 +1219,33 @@ out_error:
STATIC int
xfs_refcount_adjust(
struct xfs_btree_cur *cur,
- xfs_agblock_t agbno,
- xfs_extlen_t aglen,
- xfs_agblock_t *new_agbno,
- xfs_extlen_t *new_aglen,
+ xfs_agblock_t *agbno,
+ xfs_extlen_t *aglen,
enum xfs_refc_adjust_op adj)
{
bool shape_changed;
int shape_changes = 0;
int error;
- *new_agbno = agbno;
- *new_aglen = aglen;
if (adj == XFS_REFCOUNT_ADJUST_INCREASE)
- trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- agbno, aglen);
+ trace_xfs_refcount_increase(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, *agbno, *aglen);
else
- trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- agbno, aglen);
+ trace_xfs_refcount_decrease(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, *agbno, *aglen);
/*
* Ensure that no rcextents cross the boundary of the adjustment range.
*/
error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED,
- agbno, &shape_changed);
+ *agbno, &shape_changed);
if (error)
goto out_error;
if (shape_changed)
shape_changes++;
error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED,
- agbno + aglen, &shape_changed);
+ *agbno + *aglen, &shape_changed);
if (error)
goto out_error;
if (shape_changed)
@@ -1253,7 +1255,7 @@ xfs_refcount_adjust(
* Try to merge with the left or right extents of the range.
*/
error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED,
- new_agbno, new_aglen, adj, &shape_changed);
+ agbno, aglen, adj, &shape_changed);
if (error)
goto out_error;
if (shape_changed)
@@ -1262,7 +1264,7 @@ xfs_refcount_adjust(
cur->bc_ag.refc.shape_changes++;
/* Now that we've taken care of the ends, adjust the middle extents */
- error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, adj);
+ error = xfs_refcount_adjust_extents(cur, agbno, aglen, adj);
if (error)
goto out_error;
@@ -1298,21 +1300,20 @@ xfs_refcount_finish_one_cleanup(
static inline int
xfs_refcount_continue_op(
struct xfs_btree_cur *cur,
- xfs_fsblock_t startblock,
- xfs_agblock_t new_agbno,
- xfs_extlen_t new_len,
- xfs_fsblock_t *new_fsbno)
+ struct xfs_refcount_intent *ri,
+ xfs_agblock_t new_agbno)
{
struct xfs_mount *mp = cur->bc_mp;
struct xfs_perag *pag = cur->bc_ag.pag;
- if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len)))
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno,
+ ri->ri_blockcount)))
return -EFSCORRUPTED;
- *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
+ ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
- ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len));
- ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno));
+ ASSERT(xfs_verify_fsbext(mp, ri->ri_startblock, ri->ri_blockcount));
+ ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, ri->ri_startblock));
return 0;
}
@@ -1327,11 +1328,7 @@ xfs_refcount_continue_op(
int
xfs_refcount_finish_one(
struct xfs_trans *tp,
- enum xfs_refcount_intent_type type,
- xfs_fsblock_t startblock,
- xfs_extlen_t blockcount,
- xfs_fsblock_t *new_fsb,
- xfs_extlen_t *new_len,
+ struct xfs_refcount_intent *ri,
struct xfs_btree_cur **pcur)
{
struct xfs_mount *mp = tp->t_mountp;
@@ -1339,17 +1336,16 @@ xfs_refcount_finish_one(
struct xfs_buf *agbp = NULL;
int error = 0;
xfs_agblock_t bno;
- xfs_agblock_t new_agbno;
unsigned long nr_ops = 0;
int shape_changes = 0;
struct xfs_perag *pag;
- pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock));
- bno = XFS_FSB_TO_AGBNO(mp, startblock);
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock));
+ bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock);
- trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock),
- type, XFS_FSB_TO_AGBNO(mp, startblock),
- blockcount);
+ trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock),
+ ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock),
+ ri->ri_blockcount);
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) {
error = -EIO;
@@ -1380,42 +1376,42 @@ xfs_refcount_finish_one(
}
*pcur = rcur;
- switch (type) {
+ switch (ri->ri_type) {
case XFS_REFCOUNT_INCREASE:
- error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
- new_len, XFS_REFCOUNT_ADJUST_INCREASE);
+ error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount,
+ XFS_REFCOUNT_ADJUST_INCREASE);
if (error)
goto out_drop;
- if (*new_len > 0)
- error = xfs_refcount_continue_op(rcur, startblock,
- new_agbno, *new_len, new_fsb);
+ if (ri->ri_blockcount > 0)
+ error = xfs_refcount_continue_op(rcur, ri, bno);
break;
case XFS_REFCOUNT_DECREASE:
- error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
- new_len, XFS_REFCOUNT_ADJUST_DECREASE);
+ error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount,
+ XFS_REFCOUNT_ADJUST_DECREASE);
if (error)
goto out_drop;
- if (*new_len > 0)
- error = xfs_refcount_continue_op(rcur, startblock,
- new_agbno, *new_len, new_fsb);
+ if (ri->ri_blockcount > 0)
+ error = xfs_refcount_continue_op(rcur, ri, bno);
break;
case XFS_REFCOUNT_ALLOC_COW:
- *new_fsb = startblock + blockcount;
- *new_len = 0;
- error = __xfs_refcount_cow_alloc(rcur, bno, blockcount);
+ error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount);
+ if (error)
+ goto out_drop;
+ ri->ri_blockcount = 0;
break;
case XFS_REFCOUNT_FREE_COW:
- *new_fsb = startblock + blockcount;
- *new_len = 0;
- error = __xfs_refcount_cow_free(rcur, bno, blockcount);
+ error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount);
+ if (error)
+ goto out_drop;
+ ri->ri_blockcount = 0;
break;
default:
ASSERT(0);
error = -EFSCORRUPTED;
}
- if (!error && *new_len > 0)
- trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, type,
- bno, blockcount, new_agbno, *new_len);
+ if (!error && ri->ri_blockcount > 0)
+ trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno,
+ ri->ri_type, bno, ri->ri_blockcount);
out_drop:
xfs_perag_put(pag);
return error;
@@ -1907,8 +1903,13 @@ xfs_refcount_recover_cow_leftovers(
struct xfs_buf *agbp;
struct xfs_refcount_recovery *rr, *n;
struct list_head debris;
- union xfs_btree_irec low;
- union xfs_btree_irec high;
+ union xfs_btree_irec low = {
+ .rc.rc_domain = XFS_REFC_DOMAIN_COW,
+ };
+ union xfs_btree_irec high = {
+ .rc.rc_domain = XFS_REFC_DOMAIN_COW,
+ .rc.rc_startblock = -1U,
+ };
xfs_fsblock_t fsb;
int error;
@@ -1939,10 +1940,6 @@ xfs_refcount_recover_cow_leftovers(
cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
/* Find all the leftover CoW staging extents. */
- memset(&low, 0, sizeof(low));
- memset(&high, 0, sizeof(high));
- low.rc.rc_domain = high.rc.rc_domain = XFS_REFC_DOMAIN_COW;
- high.rc.rc_startblock = -1U;
error = xfs_btree_query_range(cur, &low, &high,
xfs_refcount_recover_extent, &debris);
xfs_btree_del_cursor(cur, error);
@@ -1968,7 +1965,11 @@ xfs_refcount_recover_cow_leftovers(
rr->rr_rrec.rc_blockcount);
/* Free the block. */
- xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
+ error = xfs_free_extent_later(tp, fsb,
+ rr->rr_rrec.rc_blockcount, NULL,
+ XFS_AG_RESV_NONE);
+ if (error)
+ goto out_trans;
error = xfs_trans_commit(tp);
if (error)
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 452f30556f5a..c633477ce3ce 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -75,9 +75,7 @@ void xfs_refcount_decrease_extent(struct xfs_trans *tp,
extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
struct xfs_btree_cur *rcur, int error);
extern int xfs_refcount_finish_one(struct xfs_trans *tp,
- enum xfs_refcount_intent_type type, xfs_fsblock_t startblock,
- xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb,
- xfs_extlen_t *new_len, struct xfs_btree_cur **pcur);
+ struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur);
extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur,
xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
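With the whole intent passed in, partial progress now comes back through the same object rather than the old new_fsb/new_len outparams. A caller sketch under that contract (requeue() is a hypothetical helper, not part of this patch):

	struct xfs_refcount_intent	ri = {
		.ri_type	= XFS_REFCOUNT_INCREASE,
		.ri_startblock	= fsbno,
		.ri_blockcount	= len,
	};

	error = xfs_refcount_finish_one(tp, &ri, &rcur);
	if (error)
		return error;
	/*
	 * ri_blockcount > 0 means the adjustment stopped early (e.g. too
	 * many ops for one transaction); ri_startblock has been advanced,
	 * so the same intent can simply be requeued to continue.
	 */
	if (ri.ri_blockcount > 0)
		requeue(&ri);	/* hypothetical helper */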
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index e1f789866683..fbd53b6951a9 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -106,18 +106,13 @@ xfs_refcountbt_free_block(
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
- int error;
trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
be32_add_cpu(&agf->agf_refcount_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
- error = xfs_free_extent(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_REFC,
- XFS_AG_RESV_METADATA);
- if (error)
- return error;
-
- return error;
+ return xfs_free_extent_later(cur->bc_tp, fsbno, 1,
+ &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA);
}
STATIC int
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index b56aca1e7c66..95d3599561ce 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2337,14 +2337,10 @@ xfs_rmap_query_range(
xfs_rmap_query_range_fn fn,
void *priv)
{
- union xfs_btree_irec low_brec;
- union xfs_btree_irec high_brec;
- struct xfs_rmap_query_range_info query;
+ union xfs_btree_irec low_brec = { .r = *low_rec };
+ union xfs_btree_irec high_brec = { .r = *high_rec };
+ struct xfs_rmap_query_range_info query = { .priv = priv, .fn = fn };
- low_brec.r = *low_rec;
- high_brec.r = *high_rec;
- query.priv = priv;
- query.fn = fn;
return xfs_btree_query_range(cur, &low_brec, &high_brec,
xfs_rmap_query_range_helper, &query);
}
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 655108a4cd05..760172a65aff 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -16,6 +16,7 @@
#include "xfs_trans.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
+#include "xfs_rtbitmap.h"
/*
* Realtime allocator bitmap functions shared with userspace.
@@ -1129,3 +1130,4 @@ xfs_rtalloc_extent_is_free(
*is_free = matches;
return 0;
}
+
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
new file mode 100644
index 000000000000..b89712983347
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_RTBITMAP_H__
+#define __XFS_RTBITMAP_H__
+
+/*
+ * XXX: Most of the realtime allocation functions deal in units of realtime
+ * extents, not realtime blocks. This looks funny when paired with the type
+ * name and screams for a larger cleanup.
+ */
+struct xfs_rtalloc_rec {
+ xfs_rtblock_t ar_startext;
+ xfs_rtbxlen_t ar_extcount;
+};
+
+typedef int (*xfs_rtalloc_query_range_fn)(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv);
+
+#ifdef CONFIG_XFS_RT
+int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
+int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_extlen_t len, int val,
+ xfs_rtblock_t *new, int *stat);
+int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_rtblock_t limit,
+ xfs_rtblock_t *rtblock);
+int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_rtblock_t limit,
+ xfs_rtblock_t *rtblock);
+int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_extlen_t len, int val);
+int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp,
+ int log, xfs_rtblock_t bbno, int delta,
+ struct xfs_buf **rbpp, xfs_fsblock_t *rsb,
+ xfs_suminfo_t *sum);
+int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
+ xfs_rtblock_t bbno, int delta, struct xfs_buf **rbpp,
+ xfs_fsblock_t *rsb);
+int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_extlen_t len,
+ struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
+int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *low_rec,
+ const struct xfs_rtalloc_rec *high_rec,
+ xfs_rtalloc_query_range_fn fn, void *priv);
+int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv);
+bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
+int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_extlen_t len,
+ bool *is_free);
+/*
+ * Free an extent in the realtime subvolume. Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int /* error */
+xfs_rtfree_extent(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number to free */
+ xfs_extlen_t len); /* length of extent freed */
+
+/* Same as above, but in units of rt blocks. */
+int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno,
+ xfs_filblks_t rtlen);
+
+#else /* CONFIG_XFS_RT */
+# define xfs_rtfree_extent(t,b,l) (-ENOSYS)
+# define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS)
+# define xfs_rtalloc_query_range(m,t,l,h,f,p) (-ENOSYS)
+# define xfs_rtalloc_query_all(m,t,f,p) (-ENOSYS)
+# define xfs_rtbuf_get(m,t,b,i,p) (-ENOSYS)
+# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS)
+#endif /* CONFIG_XFS_RT */
+
+#endif /* __XFS_RTBITMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index d214233ef532..8e0a176b8e0b 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -25,6 +25,7 @@
#include "xfs_da_format.h"
#include "xfs_health.h"
#include "xfs_ag.h"
+#include "xfs_rtbitmap.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -501,8 +502,9 @@ xfs_validate_sb_common(
rbmblocks = howmany_64(sbp->sb_rextents,
NBBY * sbp->sb_blocksize);
- if (sbp->sb_rextents != rexts ||
- sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) ||
+ if (!xfs_validate_rtextents(rexts) ||
+ sbp->sb_rextents != rexts ||
+ sbp->sb_rextslog != xfs_compute_rextslog(rexts) ||
sbp->sb_rbmblocks != rbmblocks) {
xfs_notice(mp,
"realtime geometry sanity check failed");
@@ -1020,11 +1022,12 @@ xfs_log_sb(
* and hence we don't need have to update it here.
*/
if (xfs_has_lazysbcount(mp)) {
- mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
+ mp->m_sb.sb_icount = percpu_counter_sum_positive(&mp->m_icount);
mp->m_sb.sb_ifree = min_t(uint64_t,
- percpu_counter_sum(&mp->m_ifree),
+ percpu_counter_sum_positive(&mp->m_ifree),
mp->m_sb.sb_icount);
- mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+ mp->m_sb.sb_fdblocks =
+ percpu_counter_sum_positive(&mp->m_fdblocks);
}
xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
@@ -1365,3 +1368,17 @@ xfs_validate_stripe_geometry(
}
return true;
}
+
+/*
+ * Compute the maximum level number of the realtime summary file, as defined by
+ * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct
+ * use of rt volumes with more than 2^32 extents.
+ */
+uint8_t
+xfs_compute_rextslog(
+ xfs_rtbxlen_t rtextents)
+{
+ if (!rtextents)
+ return 0;
+ return xfs_highbit64(rtextents);
+}
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 19134b23c10b..2e8e8d63d4eb 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -38,4 +38,6 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp,
extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp,
__s64 sunit, __s64 swidth, int sectorsize, bool silent);
+uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
+
#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 5b2f27cbdb80..1bb2891b26ff 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -334,11 +334,11 @@ xfs_calc_write_reservation(
blksz);
t1 += adj;
t3 += adj;
- return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ return XFS_DQUOT_LOGRES + max3(t1, t2, t3);
}
t4 = xfs_calc_refcountbt_reservation(mp, 1);
- return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3));
+ return XFS_DQUOT_LOGRES + max(t4, max3(t1, t2, t3));
}
unsigned int
@@ -406,11 +406,11 @@ xfs_calc_itruncate_reservation(
xfs_refcountbt_block_count(mp, 4),
blksz);
- return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ return XFS_DQUOT_LOGRES + max3(t1, t2, t3);
}
t4 = xfs_calc_refcountbt_reservation(mp, 2);
- return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3));
+ return XFS_DQUOT_LOGRES + max(t4, max3(t1, t2, t3));
}
unsigned int
@@ -436,7 +436,7 @@ STATIC uint
xfs_calc_rename_reservation(
struct xfs_mount *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
+ return XFS_DQUOT_LOGRES +
max((xfs_calc_inode_res(mp, 5) +
xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
@@ -475,7 +475,7 @@ STATIC uint
xfs_calc_link_reservation(
struct xfs_mount *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
+ return XFS_DQUOT_LOGRES +
xfs_calc_iunlink_remove_reservation(mp) +
max((xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
@@ -513,7 +513,7 @@ STATIC uint
xfs_calc_remove_reservation(
struct xfs_mount *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
+ return XFS_DQUOT_LOGRES +
xfs_calc_iunlink_add_reservation(mp) +
max((xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
@@ -572,7 +572,7 @@ xfs_calc_icreate_resv_alloc(
STATIC uint
xfs_calc_icreate_reservation(xfs_mount_t *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
+ return XFS_DQUOT_LOGRES +
max(xfs_calc_icreate_resv_alloc(mp),
xfs_calc_create_resv_modify(mp));
}
@@ -581,7 +581,7 @@ STATIC uint
xfs_calc_create_tmpfile_reservation(
struct xfs_mount *mp)
{
- uint res = XFS_DQUOT_LOGRES(mp);
+ uint res = XFS_DQUOT_LOGRES;
res += xfs_calc_icreate_resv_alloc(mp);
return res + xfs_calc_iunlink_add_reservation(mp);
@@ -630,7 +630,7 @@ STATIC uint
xfs_calc_ifree_reservation(
struct xfs_mount *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
+ return XFS_DQUOT_LOGRES +
xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
xfs_calc_iunlink_remove_reservation(mp) +
@@ -647,7 +647,7 @@ STATIC uint
xfs_calc_ichange_reservation(
struct xfs_mount *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
+ return XFS_DQUOT_LOGRES +
xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
@@ -756,7 +756,7 @@ STATIC uint
xfs_calc_addafork_reservation(
struct xfs_mount *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
+ return XFS_DQUOT_LOGRES +
xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
@@ -804,7 +804,7 @@ STATIC uint
xfs_calc_attrsetm_reservation(
struct xfs_mount *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
+ return XFS_DQUOT_LOGRES +
xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
@@ -844,7 +844,7 @@ STATIC uint
xfs_calc_attrrm_reservation(
struct xfs_mount *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
+ return XFS_DQUOT_LOGRES +
max((xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
XFS_FSB_TO_B(mp, 1)) +
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 5ebdda7e1078..42fed04f038d 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -31,6 +31,7 @@ typedef uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
typedef uint64_t xfs_rtblock_t; /* extent (block) in realtime area */
typedef uint64_t xfs_fileoff_t; /* block number in a file */
typedef uint64_t xfs_filblks_t; /* number of blocks in a file */
+typedef uint64_t xfs_rtbxlen_t; /* rtbitmap extent length in rtextents */
typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
@@ -227,4 +228,16 @@ bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off);
bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off,
xfs_fileoff_t len);
+/* Do we support an rt volume having this number of rtextents? */
+static inline bool
+xfs_validate_rtextents(
+ xfs_rtbxlen_t rtextents)
+{
+ /* No runt rt volumes */
+ if (rtextents == 0)
+ return false;
+
+ return true;
+}
+
#endif /* __XFS_TYPES_H__ */
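
The new xfs_validate_rtextents() helper is self-contained enough to exercise
outside the kernel. A small sketch of how a mount-time geometry check might use
it; the superblock value here is hypothetical:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t xfs_rtbxlen_t;

/* same shape as the helper added above */
static inline bool xfs_validate_rtextents(xfs_rtbxlen_t rtextents)
{
	return rtextents != 0; /* no runt rt volumes */
}

int main(void)
{
	xfs_rtbxlen_t rtextents = 0; /* hypothetical superblock value */

	if (!xfs_validate_rtextents(rtextents)) {
		fprintf(stderr, "invalid rt volume geometry\n");
		return 1;
	}
	return 0;
}
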
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index b6f0c9f3f124..f51771e5c3fe 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -159,6 +159,11 @@ xchk_xattr_listent(
args.value = xchk_xattr_valuebuf(sx->sc);
args.valuelen = valuelen;
+ /*
+ * Get the attr value to ensure that lookup can find this attribute
+ * through the dabtree indexing and that remote value retrieval also
+ * works correctly.
+ */
error = xfs_attr_get_ilocked(&args);
/* ENODATA means the hash lookup failed and the attr is bad */
if (error == -ENODATA)
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index f0b9cb6506fd..45b135929144 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -647,7 +647,13 @@ xchk_bmap(
}
break;
case XFS_ATTR_FORK:
- if (!xfs_has_attr(mp) && !xfs_has_attr2(mp))
+ /*
+ * "attr" means that an attr fork was created at some point in
+ * the life of this filesystem. "attr2" means that inodes have
+ * variable-sized data/attr fork areas. Hence we only check
+ * attr here.
+ */
+ if (!xfs_has_attr(mp))
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
break;
default:
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index c18bd039fce9..e0ed0ebfdaea 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -582,7 +582,8 @@ xrep_reap_block(
else if (resv == XFS_AG_RESV_AGFL)
error = xrep_put_freelist(sc, agbno);
else
- error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
+ error = xfs_free_extent(sc->tp, sc->sa.pag, agbno, 1, oinfo,
+ resv);
if (agf_bp != sc->sa.agf_bp)
xfs_trans_brelse(sc->tp, agf_bp);
if (error)
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 0a3bde64c675..fad7c353ada6 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -11,9 +11,10 @@
#include "xfs_mount.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
+#include "xfs_sb.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index f6ffb4f248f7..9355ccad9503 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -10,6 +10,10 @@
#define DEBUG 1
#endif
+#ifdef CONFIG_XFS_DEBUG_EXPENSIVE
+#define DEBUG_EXPENSIVE 1
+#endif
+
#ifdef CONFIG_XFS_ASSERT_FATAL
#define XFS_ASSERT_FATAL 1
#endif
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 21c241e96d48..50a7f2745514 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -225,45 +225,6 @@ xfs_imap_valid(
return true;
}
-/*
- * Pass in a dellalloc extent and convert it to real extents, return the real
- * extent that maps offset_fsb in wpc->iomap.
- *
- * The current page is held locked so nothing could have removed the block
- * backing offset_fsb, although it could have moved from the COW to the data
- * fork by another thread.
- */
-static int
-xfs_convert_blocks(
- struct iomap_writepage_ctx *wpc,
- struct xfs_inode *ip,
- int whichfork,
- loff_t offset)
-{
- int error;
- unsigned *seq;
-
- if (whichfork == XFS_COW_FORK)
- seq = &XFS_WPC(wpc)->cow_seq;
- else
- seq = &XFS_WPC(wpc)->data_seq;
-
- /*
- * Attempt to allocate whatever delalloc extent currently backs offset
- * and put the result into wpc->iomap. Allocate in a loop because it
- * may take several attempts to allocate real blocks for a contiguous
- * delalloc extent if free space is sufficiently fragmented.
- */
- do {
- error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
- &wpc->iomap, seq);
- if (error)
- return error;
- } while (wpc->iomap.offset + wpc->iomap.length <= offset);
-
- return 0;
-}
-
static int
xfs_map_blocks(
struct iomap_writepage_ctx *wpc,
@@ -281,6 +242,7 @@ xfs_map_blocks(
struct xfs_iext_cursor icur;
int retries = 0;
int error = 0;
+ unsigned int *seq;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -376,7 +338,19 @@ retry:
trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
return 0;
allocate_blocks:
- error = xfs_convert_blocks(wpc, ip, whichfork, offset);
+ /*
+ * Convert a delalloc extent to a real one. The current page is held
+ * locked so nothing could have removed the block backing offset_fsb,
+ * although it could have moved from the COW to the data fork by another
+ * thread.
+ */
+ if (whichfork == XFS_COW_FORK)
+ seq = &XFS_WPC(wpc)->cow_seq;
+ else
+ seq = &XFS_WPC(wpc)->data_seq;
+
+ error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
+ &wpc->iomap, seq);
if (error) {
/*
* If we failed to find the extent in the COW fork we might have
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 2788a6f2edcd..4a712f1565c1 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -329,6 +329,13 @@ xfs_xattri_finish_update(
goto out;
}
+ /* If an attr removal is trivially complete, we're done. */
+ if (attr->xattri_op_flags == XFS_ATTRI_OP_FLAGS_REMOVE &&
+ !xfs_inode_hasattr(args->dp)) {
+ error = 0;
+ goto out;
+ }
+
error = xfs_attr_set_iter(attr);
if (!error && attr->xattri_dela_state != XFS_DAS_DONE)
error = -EAGAIN;
@@ -503,6 +510,9 @@ xfs_attri_validate(
unsigned int op = attrp->alfi_op_flags &
XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+ if (!xfs_sb_version_haslogxattrs(&mp->m_sb))
+ return false;
+
if (attrp->__pad != 0)
return false;
@@ -547,7 +557,7 @@ xfs_attri_item_recover(
struct xfs_inode *ip;
struct xfs_da_args *args;
struct xfs_trans *tp;
- struct xfs_trans_res tres;
+ struct xfs_trans_res resv;
struct xfs_attri_log_format *attrp;
struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
int error;
@@ -594,8 +604,6 @@ xfs_attri_item_recover(
args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT |
XFS_DA_OP_LOGGED;
- ASSERT(xfs_sb_version_haslogxattrs(&mp->m_sb));
-
switch (attr->xattri_op_flags) {
case XFS_ATTRI_OP_FLAGS_SET:
case XFS_ATTRI_OP_FLAGS_REPLACE:
@@ -608,8 +616,6 @@ xfs_attri_item_recover(
attr->xattri_dela_state = xfs_attr_init_add_state(args);
break;
case XFS_ATTRI_OP_FLAGS_REMOVE:
- if (!xfs_inode_hasattr(args->dp))
- goto out;
attr->xattri_dela_state = xfs_attr_init_remove_state(args);
break;
default:
@@ -618,8 +624,9 @@ xfs_attri_item_recover(
goto out;
}
- xfs_init_attr_trans(args, &tres, &total);
- error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp);
+ xfs_init_attr_trans(args, &resv, &total);
+ resv = xlog_recover_resv(&resv);
+ error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp);
if (error)
goto out;
@@ -710,48 +717,111 @@ xlog_recover_attri_commit_pass2(
const void *attr_value = NULL;
const void *attr_name;
size_t len;
-
- attri_formatp = item->ri_buf[0].i_addr;
- attr_name = item->ri_buf[1].i_addr;
+ unsigned int op, i = 0;
/* Validate xfs_attri_log_format before the large memory allocation */
len = sizeof(struct xfs_attri_log_format);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[i].i_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
return -EFSCORRUPTED;
}
+ attri_formatp = item->ri_buf[i].i_addr;
if (!xfs_attri_validate(mp, attri_formatp)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ attri_formatp, len);
return -EFSCORRUPTED;
}
+ /* Check the number of log iovecs makes sense for the op code. */
+ op = attri_formatp->alfi_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+ switch (op) {
+ case XFS_ATTRI_OP_FLAGS_SET:
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ /* Log item, attr name, attr value */
+ if (item->ri_total != 3) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ break;
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ /* Log item, attr name */
+ if (item->ri_total != 2) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ break;
+ default:
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ i++;
+
/* Validate the attr name */
- if (item->ri_buf[1].i_len !=
+ if (item->ri_buf[i].i_len !=
xlog_calc_iovec_len(attri_formatp->alfi_name_len)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ attri_formatp, len);
return -EFSCORRUPTED;
}
+ attr_name = item->ri_buf[i].i_addr;
if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[1].i_addr, item->ri_buf[1].i_len);
+ attri_formatp, len);
return -EFSCORRUPTED;
}
+ i++;
/* Validate the attr value, if present */
if (attri_formatp->alfi_value_len != 0) {
- if (item->ri_buf[2].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) {
+ if (item->ri_buf[i].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
item->ri_buf[0].i_addr,
item->ri_buf[0].i_len);
return -EFSCORRUPTED;
}
- attr_value = item->ri_buf[2].i_addr;
+ attr_value = item->ri_buf[i].i_addr;
+ i++;
+ }
+
+ /*
+ * Make sure we got the correct number of buffers for the operation
+ * that we just loaded.
+ */
+ if (i != item->ri_total) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+
+ switch (op) {
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ /* Regular remove operations operate only on names. */
+ if (attr_value != NULL || attri_formatp->alfi_value_len != 0) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ fallthrough;
+ case XFS_ATTRI_OP_FLAGS_SET:
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ /*
+ * Regular xattr set/remove/replace operations require a name
+ * and do not take a newname. Values are optional for set and
+ * replace.
+ */
+ if (attr_name == NULL || attri_formatp->alfi_name_len == 0) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ attri_formatp, len);
+ return -EFSCORRUPTED;
+ }
+ break;
}
/*
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 41323da523d1..1058603db3ac 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -246,18 +246,11 @@ static int
xfs_trans_log_finish_bmap_update(
struct xfs_trans *tp,
struct xfs_bud_log_item *budp,
- enum xfs_bmap_intent_type type,
- struct xfs_inode *ip,
- int whichfork,
- xfs_fileoff_t startoff,
- xfs_fsblock_t startblock,
- xfs_filblks_t *blockcount,
- xfs_exntst_t state)
+ struct xfs_bmap_intent *bi)
{
int error;
- error = xfs_bmap_finish_one(tp, ip, type, whichfork, startoff,
- startblock, blockcount, state);
+ error = xfs_bmap_finish_one(tp, bi);
/*
* Mark the transaction dirty, even on error. This ensures the
@@ -378,25 +371,17 @@ xfs_bmap_update_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
- struct xfs_bmap_intent *bmap;
- xfs_filblks_t count;
+ struct xfs_bmap_intent *bi;
int error;
- bmap = container_of(item, struct xfs_bmap_intent, bi_list);
- count = bmap->bi_bmap.br_blockcount;
- error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done),
- bmap->bi_type,
- bmap->bi_owner, bmap->bi_whichfork,
- bmap->bi_bmap.br_startoff,
- bmap->bi_bmap.br_startblock,
- &count,
- bmap->bi_bmap.br_state);
- if (!error && count > 0) {
- ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
- bmap->bi_bmap.br_blockcount = count;
+ bi = container_of(item, struct xfs_bmap_intent, bi_list);
+
+ error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi);
+ if (!error && bi->bi_bmap.br_blockcount > 0) {
+ ASSERT(bi->bi_type == XFS_BMAP_UNMAP);
return -EAGAIN;
}
- kmem_cache_free(xfs_bmap_intent_cache, bmap);
+ kmem_cache_free(xfs_bmap_intent_cache, bi);
return error;
}
@@ -471,17 +456,14 @@ xfs_bui_item_recover(
struct xfs_log_item *lip,
struct list_head *capture_list)
{
- struct xfs_bmbt_irec irec;
+ struct xfs_bmap_intent fake = { };
+ struct xfs_trans_res resv;
struct xfs_bui_log_item *buip = BUI_ITEM(lip);
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
struct xfs_mount *mp = lip->li_log->l_mp;
- struct xfs_map_extent *bmap;
+ struct xfs_map_extent *map;
struct xfs_bud_log_item *budp;
- xfs_filblks_t count;
- xfs_exntst_t state;
- unsigned int bui_type;
- int whichfork;
int iext_delta;
int error = 0;
@@ -491,19 +473,18 @@ xfs_bui_item_recover(
return -EFSCORRUPTED;
}
- bmap = &buip->bui_format.bui_extents[0];
- state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
- XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
- whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
+ map = &buip->bui_format.bui_extents[0];
+ fake.bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
XFS_ATTR_FORK : XFS_DATA_FORK;
- bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
+ fake.bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
- error = xlog_recover_iget(mp, bmap->me_owner, &ip);
+ error = xlog_recover_iget(mp, map->me_owner, &ip);
if (error)
return error;
/* Allocate transaction and do the work. */
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
+ error = xfs_trans_alloc(mp, &resv,
XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
if (error)
goto err_rele;
@@ -512,34 +493,34 @@ xfs_bui_item_recover(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- if (bui_type == XFS_BMAP_MAP)
+ if (fake.bi_type == XFS_BMAP_MAP)
iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT;
else
iext_delta = XFS_IEXT_PUNCH_HOLE_CNT;
- error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta);
+ error = xfs_iext_count_may_overflow(ip, fake.bi_whichfork, iext_delta);
if (error == -EFBIG)
error = xfs_iext_count_upgrade(tp, ip, iext_delta);
if (error)
goto err_cancel;
- count = bmap->me_len;
- error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip,
- whichfork, bmap->me_startoff, bmap->me_startblock,
- &count, state);
+ fake.bi_owner = ip;
+ fake.bi_bmap.br_startblock = map->me_startblock;
+ fake.bi_bmap.br_startoff = map->me_startoff;
+ fake.bi_bmap.br_blockcount = map->me_len;
+ fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+
+ error = xfs_trans_log_finish_bmap_update(tp, budp, &fake);
if (error == -EFSCORRUPTED)
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bmap,
- sizeof(*bmap));
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map,
+ sizeof(*map));
if (error)
goto err_cancel;
- if (count > 0) {
- ASSERT(bui_type == XFS_BMAP_UNMAP);
- irec.br_startblock = bmap->me_startblock;
- irec.br_blockcount = count;
- irec.br_startoff = bmap->me_startoff;
- irec.br_state = state;
- xfs_bmap_unmap_extent(tp, ip, &irec);
+ if (fake.bi_bmap.br_blockcount > 0) {
+ ASSERT(fake.bi_type == XFS_BMAP_UNMAP);
+ xfs_bmap_unmap_extent(tp, ip, &fake.bi_bmap);
}
/*
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 468bb61a5e46..bab8ba224e10 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -636,13 +636,11 @@ out_unlock:
/*
* Test whether it is appropriate to check an inode for and free post EOF
- * blocks. The 'force' parameter determines whether we should also consider
- * regular files that are marked preallocated or append-only.
+ * blocks.
*/
bool
xfs_can_free_eofblocks(
- struct xfs_inode *ip,
- bool force)
+ struct xfs_inode *ip)
{
struct xfs_bmbt_irec imap;
struct xfs_mount *mp = ip->i_mount;
@@ -676,11 +674,11 @@ xfs_can_free_eofblocks(
return false;
/*
- * Do not free real preallocated or append-only files unless the file
- * has delalloc blocks and we are forced to remove them.
+ * Only free real extents for inodes with persistent preallocations or
+ * the append-only flag.
*/
if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
- if (!force || ip->i_delayed_blks == 0)
+ if (ip->i_delayed_blks == 0)
return false;
/*
@@ -734,6 +732,22 @@ xfs_free_eofblocks(
/* Wait on dio to ensure i_size has settled. */
inode_dio_wait(VFS_I(ip));
+ /*
+ * For preallocated files, only free delayed allocations.
+ *
+ * Note that this means we also leave speculative preallocations in
+ * place for preallocated files.
+ */
+ if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) {
+ if (ip->i_delayed_blks) {
+ xfs_bmap_punch_delalloc_range(ip,
+ round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
+ LLONG_MAX);
+ }
+ xfs_inode_clear_eofblocks_tag(ip);
+ return 0;
+ }
+
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
if (error) {
ASSERT(xfs_is_shutdown(mp));
@@ -868,33 +882,32 @@ xfs_alloc_file_space(
if (error)
goto error;
- error = xfs_bmapi_write(tp, ip, startoffset_fsb,
- allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
- &nimaps);
- if (error)
- goto error;
-
- ip->i_diflags |= XFS_DIFLAG_PREALLOC;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
- error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- break;
-
/*
* If the allocator cannot find a single free extent large
* enough to cover the start block of the requested range,
- * xfs_bmapi_write will return 0 but leave *nimaps set to 0.
+ * xfs_bmapi_write will return -ENOSR.
*
* In that case we simply need to keep looping with the same
* startoffset_fsb so that one of the following allocations
* will eventually reach the requested range.
*/
- if (nimaps) {
+ error = xfs_bmapi_write(tp, ip, startoffset_fsb,
+ allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
+ &nimaps);
+ if (error) {
+ if (error != -ENOSR)
+ goto error;
+ error = 0;
+ } else {
startoffset_fsb += imapp->br_blockcount;
allocatesize_fsb -= imapp->br_blockcount;
}
+
+ ip->i_diflags |= XFS_DIFLAG_PREALLOC;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
return error;
@@ -950,14 +963,18 @@ xfs_flush_unmap_range(
xfs_off_t offset,
xfs_off_t len)
{
- struct xfs_mount *mp = ip->i_mount;
struct inode *inode = VFS_I(ip);
xfs_off_t rounding, start, end;
int error;
- rounding = max_t(xfs_off_t, mp->m_sb.sb_blocksize, PAGE_SIZE);
- start = round_down(offset, rounding);
- end = round_up(offset + len, rounding) - 1;
+ /*
+ * Make sure we extend the flush out to extent alignment
+ * boundaries so any extent range overlapping the start/end
+ * of the modification we are about to do is clean and idle.
+ */
+ rounding = max_t(xfs_off_t, xfs_inode_alloc_unitsize(ip), PAGE_SIZE);
+ start = rounddown_64(offset, rounding);
+ end = roundup_64(offset + len, rounding) - 1;
error = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (error)
@@ -1042,14 +1059,14 @@ xfs_prepare_shift(
struct xfs_inode *ip,
loff_t offset)
{
- struct xfs_mount *mp = ip->i_mount;
+ unsigned int rounding;
int error;
/*
* Trim eofblocks to avoid shifting uninitialized post-eof preallocation
* into the accessible region of the file.
*/
- if (xfs_can_free_eofblocks(ip, true)) {
+ if (xfs_can_free_eofblocks(ip)) {
error = xfs_free_eofblocks(ip);
if (error)
return error;
@@ -1060,11 +1077,13 @@ xfs_prepare_shift(
* with the full range of the operation. If we don't, a COW writeback
* completion could race with an insert, front merge with the start
* extent (after split) during the shift and corrupt the file. Start
- * with the block just prior to the start to stabilize the boundary.
+ * with the allocation unit just prior to the start to stabilize the
+ * boundary.
*/
- offset = round_down(offset, mp->m_sb.sb_blocksize);
+ rounding = xfs_inode_alloc_unitsize(ip);
+ offset = rounddown_64(offset, rounding);
if (offset)
- offset -= mp->m_sb.sb_blocksize;
+ offset -= rounding;
/*
* Writeback and invalidate cache for the remainder of the file as we're
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 6888078f5c31..1383019ccdb7 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -63,7 +63,7 @@ int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
xfs_off_t len);
/* EOF block manipulation functions */
-bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
+bool xfs_can_free_eofblocks(struct xfs_inode *ip);
int xfs_free_eofblocks(struct xfs_inode *ip);
int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 54c774af6e1c..257945cdf63b 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -2040,6 +2040,14 @@ error_free:
return NULL;
}
+static inline void
+xfs_buf_list_del(
+ struct xfs_buf *bp)
+{
+ list_del_init(&bp->b_list);
+ wake_up_var(&bp->b_list);
+}
+
/*
* Cancel a delayed write list.
*
@@ -2057,7 +2065,7 @@ xfs_buf_delwri_cancel(
xfs_buf_lock(bp);
bp->b_flags &= ~_XBF_DELWRI_Q;
- list_del_init(&bp->b_list);
+ xfs_buf_list_del(bp);
xfs_buf_relse(bp);
}
}
@@ -2111,6 +2119,34 @@ xfs_buf_delwri_queue(
}
/*
+ * Queue a buffer to this delwri list as part of a data integrity operation.
+ * If the buffer is on any other delwri list, we'll wait for that to clear
+ * so that the caller can submit the buffer for IO and wait for the result.
+ * Callers must ensure the buffer is not already on the list.
+ */
+void
+xfs_buf_delwri_queue_here(
+ struct xfs_buf *bp,
+ struct list_head *buffer_list)
+{
+ /*
+ * We need this buffer to end up on the /caller's/ delwri list, not any
+ * old list. This can happen if the buffer is marked stale (which
+ * clears DELWRI_Q) after the AIL queues the buffer to its list but
+ * before the AIL has a chance to submit the list.
+ */
+ while (!list_empty(&bp->b_list)) {
+ xfs_buf_unlock(bp);
+ wait_var_event(&bp->b_list, list_empty(&bp->b_list));
+ xfs_buf_lock(bp);
+ }
+
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+
+ xfs_buf_delwri_queue(bp, buffer_list);
+}
+
+/*
* Compare function is more complex than it needs to be because
* the return value is only 32 bits and we are doing comparisons
* on 64 bit values
@@ -2172,7 +2208,7 @@ xfs_buf_delwri_submit_buffers(
* reference and remove it from the list here.
*/
if (!(bp->b_flags & _XBF_DELWRI_Q)) {
- list_del_init(&bp->b_list);
+ xfs_buf_list_del(bp);
xfs_buf_relse(bp);
continue;
}
@@ -2192,7 +2228,7 @@ xfs_buf_delwri_submit_buffers(
list_move_tail(&bp->b_list, wait_list);
} else {
bp->b_flags |= XBF_ASYNC;
- list_del_init(&bp->b_list);
+ xfs_buf_list_del(bp);
}
__xfs_buf_submit(bp, false);
}
@@ -2246,7 +2282,7 @@ xfs_buf_delwri_submit(
while (!list_empty(&wait_list)) {
bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
- list_del_init(&bp->b_list);
+ xfs_buf_list_del(bp);
/*
* Wait on the locked buffer, check for errors and unlock and
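
The xfs_buf_list_del()/xfs_buf_delwri_queue_here() pair above implements a
wait-until-off-list handshake: removers wake anyone sleeping on the list
membership variable, and queue_here loops until the buffer has left whatever
list it was on. A userspace sketch of the same shape, with pthreads standing in
for wait_var_event()/wake_up_var(); all names and timings are illustrative:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool on_list = true; /* stands in for !list_empty(&bp->b_list) */

/* analogous to xfs_buf_list_del(): remove, then wake waiters */
static void list_del_and_wake(void)
{
	pthread_mutex_lock(&lock);
	on_list = false;
	pthread_cond_broadcast(&cond); /* wake_up_var(&bp->b_list) */
	pthread_mutex_unlock(&lock);
}

/* analogous to the wait loop in xfs_buf_delwri_queue_here() */
static void wait_until_off_list(void)
{
	pthread_mutex_lock(&lock);
	while (on_list) /* wait_var_event(..., list_empty(...)) */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

static void *remover(void *arg)
{
	(void)arg;
	usleep(1000);
	list_del_and_wake();
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, remover, NULL);
	wait_until_off_list(); /* blocks until the remover runs */
	pthread_join(t, NULL);
	puts("buffer left the list");
	return 0;
}
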
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 549c60942208..6cf0332ba62c 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -305,6 +305,7 @@ extern void xfs_buf_stale(struct xfs_buf *bp);
/* Delayed Write Buffer Routines */
extern void xfs_buf_delwri_cancel(struct list_head *);
extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
+void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl);
extern int xfs_buf_delwri_submit(struct list_head *);
extern int xfs_buf_delwri_submit_nowait(struct list_head *);
extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 023d4e0385dd..b02ce568de0c 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -22,6 +22,7 @@
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
+#include "xfs_error.h"
struct kmem_cache *xfs_buf_item_cache;
@@ -781,8 +782,39 @@ xfs_buf_item_committed(
return lsn;
}
+#ifdef DEBUG_EXPENSIVE
+static int
+xfs_buf_item_precommit(
+ struct xfs_trans *tp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ if (!bp->b_ops || !bp->b_ops->verify_struct)
+ return 0;
+ if (bip->bli_flags & XFS_BLI_STALE)
+ return 0;
+
+ fa = bp->b_ops->verify_struct(bp);
+ if (fa) {
+ xfs_buf_verifier_error(bp, -EFSCORRUPTED, bp->b_ops->name,
+ bp->b_addr, BBTOB(bp->b_length), fa);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ ASSERT(fa == NULL);
+ }
+
+ return 0;
+}
+#else
+# define xfs_buf_item_precommit NULL
+#endif
+
static const struct xfs_item_ops xfs_buf_item_ops = {
.iop_size = xfs_buf_item_size,
+ .iop_precommit = xfs_buf_item_precommit,
.iop_format = xfs_buf_item_format,
.iop_pin = xfs_buf_item_pin,
.iop_unpin = xfs_buf_item_unpin,
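
The DEBUG_EXPENSIVE precommit hook above re-verifies a buffer's structure just
before it is committed and shuts the filesystem down if the in-core copy is
already corrupt. A standalone sketch of that pattern under invented types;
buf_ops, force_shutdown() and verify_magic() are stand-ins, not kernel API:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

typedef const char *failaddr_t; /* NULL means "verified clean" */

struct buf_ops {
	const char *name;
	failaddr_t (*verify_struct)(const void *buf, size_t len);
};

static void force_shutdown(const char *why)
{
	fprintf(stderr, "shutting down: %s\n", why);
	exit(1);
}

static int item_precommit(const struct buf_ops *ops,
			  const void *buf, size_t len)
{
	failaddr_t fa;

	if (!ops || !ops->verify_struct)
		return 0; /* nothing to check */

	fa = ops->verify_struct(buf, len);
	if (fa)
		force_shutdown(fa); /* corrupt in-core metadata */
	return 0;
}

static failaddr_t verify_magic(const void *buf, size_t len)
{
	const unsigned char *p = buf;

	if (len < 4 || p[0] != 'X')
		return "bad magic";
	return NULL;
}

int main(void)
{
	struct buf_ops ops = { "demo", verify_magic };
	unsigned char good[4] = { 'X', 'F', 'S', 'B' };

	return item_precommit(&ops, good, sizeof(good));
}
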
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index a8b2f3b278ea..6186b69be50a 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -333,7 +333,6 @@ xfs_dquot_disk_alloc(
goto err_cancel;
ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
- ASSERT(nmaps == 1);
ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
(map.br_startblock != HOLESTARTBLOCK));
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 6a1aae799cf1..7d19091215b0 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -17,6 +17,7 @@
#include "xfs_trans_priv.h"
#include "xfs_qm.h"
#include "xfs_log.h"
+#include "xfs_error.h"
static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
{
@@ -193,8 +194,38 @@ xfs_qm_dquot_logitem_committing(
return xfs_qm_dquot_logitem_release(lip);
}
+#ifdef DEBUG_EXPENSIVE
+static int
+xfs_qm_dquot_logitem_precommit(
+ struct xfs_trans *tp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
+ struct xfs_mount *mp = dqp->q_mount;
+ struct xfs_disk_dquot ddq = { };
+ xfs_failaddr_t fa;
+
+ xfs_dquot_to_disk(&ddq, dqp);
+ fa = xfs_dquot_verify(mp, &ddq, dqp->q_id);
+ if (fa) {
+ XFS_CORRUPTION_ERROR("Bad dquot during logging",
+ XFS_ERRLEVEL_LOW, mp, &ddq, sizeof(ddq));
+ xfs_alert(mp,
+ "Metadata corruption detected at %pS, dquot 0x%x",
+ fa, dqp->q_id);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ ASSERT(fa == NULL);
+ }
+
+ return 0;
+}
+#else
+# define xfs_qm_dquot_logitem_precommit NULL
+#endif
+
static const struct xfs_item_ops xfs_dquot_item_ops = {
.iop_size = xfs_qm_dquot_logitem_size,
+ .iop_precommit = xfs_qm_dquot_logitem_precommit,
.iop_format = xfs_qm_dquot_logitem_format,
.iop_pin = xfs_qm_dquot_logitem_pin,
.iop_unpin = xfs_qm_dquot_logitem_unpin,
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index d5130d1fcfae..be9f279a5c75 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -345,23 +345,29 @@ static int
xfs_trans_free_extent(
struct xfs_trans *tp,
struct xfs_efd_log_item *efdp,
- xfs_fsblock_t start_block,
- xfs_extlen_t ext_len,
- const struct xfs_owner_info *oinfo,
- bool skip_discard)
+ struct xfs_extent_free_item *xefi)
{
+ struct xfs_owner_info oinfo = { };
struct xfs_mount *mp = tp->t_mountp;
struct xfs_extent *extp;
uint next_extent;
- xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block);
xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp,
- start_block);
+ xefi->xefi_startblock);
int error;
- trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
+ oinfo.oi_owner = xefi->xefi_owner;
+ if (xefi->xefi_flags & XFS_EFI_ATTR_FORK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+ if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
+
+ trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0,
+ agbno, xefi->xefi_blockcount);
+
+ error = __xfs_free_extent(tp, xefi->xefi_pag, agbno,
+ xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv,
+ xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
- error = __xfs_free_extent(tp, start_block, ext_len,
- oinfo, XFS_AG_RESV_NONE, skip_discard);
/*
* Mark the transaction dirty, even on error. This ensures the
* transaction is aborted, which:
@@ -375,8 +381,8 @@ xfs_trans_free_extent(
next_extent = efdp->efd_next_extent;
ASSERT(next_extent < efdp->efd_format.efd_nextents);
extp = &(efdp->efd_format.efd_extents[next_extent]);
- extp->ext_start = start_block;
- extp->ext_len = ext_len;
+ extp->ext_start = xefi->xefi_startblock;
+ extp->ext_len = xefi->xefi_blockcount;
efdp->efd_next_extent++;
return error;
@@ -389,14 +395,13 @@ xfs_extent_free_diff_items(
const struct list_head *a,
const struct list_head *b)
{
- struct xfs_mount *mp = priv;
struct xfs_extent_free_item *ra;
struct xfs_extent_free_item *rb;
ra = container_of(a, struct xfs_extent_free_item, xefi_list);
rb = container_of(b, struct xfs_extent_free_item, xefi_list);
- return XFS_FSB_TO_AGNO(mp, ra->xefi_startblock) -
- XFS_FSB_TO_AGNO(mp, rb->xefi_startblock);
+
+ return ra->xefi_pag->pag_agno - rb->xefi_pag->pag_agno;
}
/* Log a free extent to the intent item. */
@@ -404,7 +409,7 @@ STATIC void
xfs_extent_free_log_item(
struct xfs_trans *tp,
struct xfs_efi_log_item *efip,
- struct xfs_extent_free_item *free)
+ struct xfs_extent_free_item *xefi)
{
uint next_extent;
struct xfs_extent *extp;
@@ -420,8 +425,8 @@ xfs_extent_free_log_item(
next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
ASSERT(next_extent < efip->efi_format.efi_nextents);
extp = &efip->efi_format.efi_extents[next_extent];
- extp->ext_start = free->xefi_startblock;
- extp->ext_len = free->xefi_blockcount;
+ extp->ext_start = xefi->xefi_startblock;
+ extp->ext_len = xefi->xefi_blockcount;
}
static struct xfs_log_item *
@@ -433,15 +438,15 @@ xfs_extent_free_create_intent(
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_efi_log_item *efip = xfs_efi_init(mp, count);
- struct xfs_extent_free_item *free;
+ struct xfs_extent_free_item *xefi;
ASSERT(count > 0);
xfs_trans_add_item(tp, &efip->efi_item);
if (sort)
list_sort(mp, items, xfs_extent_free_diff_items);
- list_for_each_entry(free, items, xefi_list)
- xfs_extent_free_log_item(tp, efip, free);
+ list_for_each_entry(xefi, items, xefi_list)
+ xfs_extent_free_log_item(tp, efip, xefi);
return &efip->efi_item;
}
@@ -455,6 +460,26 @@ xfs_extent_free_create_done(
return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item;
}
+/* Take a passive ref to the AG containing the space we're freeing. */
+void
+xfs_extent_free_get_group(
+ struct xfs_mount *mp,
+ struct xfs_extent_free_item *xefi)
+{
+ xfs_agnumber_t agno;
+
+ agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock);
+ xefi->xefi_pag = xfs_perag_get(mp, agno);
+}
+
+/* Release a passive AG ref after some freeing work. */
+static inline void
+xfs_extent_free_put_group(
+ struct xfs_extent_free_item *xefi)
+{
+ xfs_perag_put(xefi->xefi_pag);
+}
+
/* Process a free extent. */
STATIC int
xfs_extent_free_finish_item(
@@ -463,21 +488,15 @@ xfs_extent_free_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
- struct xfs_owner_info oinfo = { };
- struct xfs_extent_free_item *free;
+ struct xfs_extent_free_item *xefi;
int error;
- free = container_of(item, struct xfs_extent_free_item, xefi_list);
- oinfo.oi_owner = free->xefi_owner;
- if (free->xefi_flags & XFS_EFI_ATTR_FORK)
- oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
- if (free->xefi_flags & XFS_EFI_BMBT_BLOCK)
- oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
- error = xfs_trans_free_extent(tp, EFD_ITEM(done),
- free->xefi_startblock,
- free->xefi_blockcount,
- &oinfo, free->xefi_flags & XFS_EFI_SKIP_DISCARD);
- kmem_cache_free(xfs_extfree_item_cache, free);
+ xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
+
+ error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
+
+ xfs_extent_free_put_group(xefi);
+ kmem_cache_free(xfs_extfree_item_cache, xefi);
return error;
}
@@ -494,10 +513,12 @@ STATIC void
xfs_extent_free_cancel_item(
struct list_head *item)
{
- struct xfs_extent_free_item *free;
+ struct xfs_extent_free_item *xefi;
- free = container_of(item, struct xfs_extent_free_item, xefi_list);
- kmem_cache_free(xfs_extfree_item_cache, free);
+ xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
+
+ xfs_extent_free_put_group(xefi);
+ kmem_cache_free(xfs_extfree_item_cache, xefi);
}
const struct xfs_defer_op_type xfs_extent_free_defer_type = {
@@ -523,28 +544,25 @@ xfs_agfl_free_finish_item(
struct xfs_owner_info oinfo = { };
struct xfs_mount *mp = tp->t_mountp;
struct xfs_efd_log_item *efdp = EFD_ITEM(done);
- struct xfs_extent_free_item *free;
+ struct xfs_extent_free_item *xefi;
struct xfs_extent *extp;
struct xfs_buf *agbp;
int error;
- xfs_agnumber_t agno;
xfs_agblock_t agbno;
uint next_extent;
- struct xfs_perag *pag;
- free = container_of(item, struct xfs_extent_free_item, xefi_list);
- ASSERT(free->xefi_blockcount == 1);
- agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
- agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
- oinfo.oi_owner = free->xefi_owner;
+ xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
+ ASSERT(xefi->xefi_blockcount == 1);
+ agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock);
+ oinfo.oi_owner = xefi->xefi_owner;
- trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
+ trace_xfs_agfl_free_deferred(mp, xefi->xefi_pag->pag_agno, 0, agbno,
+ xefi->xefi_blockcount);
- pag = xfs_perag_get(mp, agno);
- error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
+ error = xfs_alloc_read_agf(xefi->xefi_pag, tp, 0, &agbp);
if (!error)
- error = xfs_free_agfl_block(tp, agno, agbno, agbp, &oinfo);
- xfs_perag_put(pag);
+ error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno,
+ agbno, agbp, &oinfo);
/*
* Mark the transaction dirty, even on error. This ensures the
@@ -559,11 +577,12 @@ xfs_agfl_free_finish_item(
next_extent = efdp->efd_next_extent;
ASSERT(next_extent < efdp->efd_format.efd_nextents);
extp = &(efdp->efd_format.efd_extents[next_extent]);
- extp->ext_start = free->xefi_startblock;
- extp->ext_len = free->xefi_blockcount;
+ extp->ext_start = xefi->xefi_startblock;
+ extp->ext_len = xefi->xefi_blockcount;
efdp->efd_next_extent++;
- kmem_cache_free(xfs_extfree_item_cache, free);
+ xfs_extent_free_put_group(xefi);
+ kmem_cache_free(xfs_extfree_item_cache, xefi);
return error;
}
@@ -595,11 +614,11 @@ xfs_efi_item_recover(
struct xfs_log_item *lip,
struct list_head *capture_list)
{
+ struct xfs_trans_res resv;
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
struct xfs_mount *mp = lip->li_log->l_mp;
struct xfs_efd_log_item *efdp;
struct xfs_trans *tp;
- struct xfs_extent *extp;
int i;
int error = 0;
@@ -618,16 +637,27 @@ xfs_efi_item_recover(
}
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
+ error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp);
if (error)
return error;
efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
for (i = 0; i < efip->efi_format.efi_nextents; i++) {
+ struct xfs_extent_free_item fake = {
+ .xefi_owner = XFS_RMAP_OWN_UNKNOWN,
+ .xefi_agresv = XFS_AG_RESV_NONE,
+ };
+ struct xfs_extent *extp;
+
extp = &efip->efi_format.efi_extents[i];
- error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
- extp->ext_len,
- &XFS_RMAP_OINFO_ANY_OWNER, false);
+
+ fake.xefi_startblock = extp->ext_start;
+ fake.xefi_blockcount = extp->ext_len;
+
+ xfs_extent_free_get_group(mp, &fake);
+ error = xfs_trans_free_extent(tp, efdp, &fake);
+ xfs_extent_free_put_group(&fake);
if (error == -EFSCORRUPTED)
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
extp, sizeof(*extp));
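
The xfs_extent_free_get_group()/put_group() pair above pins the perag from item
creation until the item is finished or cancelled, so the AG structure cannot go
away while deferred work is queued. A sketch of that passive-reference lifetime
using plain C11 atomics; struct perag here is a toy stand-in for the kernel one:

#include <stdatomic.h>
#include <stdio.h>

struct perag {
	unsigned int agno;
	atomic_int ref;
};

static struct perag *perag_get(struct perag *pag)
{
	atomic_fetch_add(&pag->ref, 1);
	return pag;
}

static void perag_put(struct perag *pag)
{
	atomic_fetch_sub(&pag->ref, 1);
}

struct free_item {
	struct perag *pag; /* held for the life of the item */
	unsigned long long startblock;
};

static void item_create(struct free_item *xefi, struct perag *pag)
{
	xefi->pag = perag_get(pag); /* take the reference at creation */
}

static void item_finish_or_cancel(struct free_item *xefi)
{
	perag_put(xefi->pag); /* drop it on both exit paths */
}

int main(void)
{
	struct perag ag = { .agno = 0, .ref = 1 };
	struct free_item it;

	item_create(&it, &ag);
	item_finish_or_cancel(&it);
	printf("agno %u refcount back to %d\n", ag.agno,
	       atomic_load(&ag.ref));
	return 0;
}
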
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 821cb86a83bd..3c910e36da69 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -24,6 +24,7 @@
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
+#include "xfs_file.h"
#include <linux/dax.h>
#include <linux/falloc.h>
@@ -38,33 +39,25 @@ static const struct vm_operations_struct xfs_file_vm_ops;
* Decide if the given file range is aligned to the size of the fundamental
* allocation unit for the file.
*/
-static bool
+bool
xfs_is_falloc_aligned(
struct xfs_inode *ip,
loff_t pos,
long long int len)
{
- struct xfs_mount *mp = ip->i_mount;
- uint64_t mask;
-
- if (XFS_IS_REALTIME_INODE(ip)) {
- if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
- u64 rextbytes;
- u32 mod;
-
- rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
- div_u64_rem(pos, rextbytes, &mod);
- if (mod)
- return false;
- div_u64_rem(len, rextbytes, &mod);
- return mod == 0;
- }
- mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
- } else {
- mask = mp->m_sb.sb_blocksize - 1;
+ unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip);
+
+ if (!is_power_of_2(alloc_unit)) {
+ u32 mod;
+
+ div_u64_rem(pos, alloc_unit, &mod);
+ if (mod)
+ return false;
+ div_u64_rem(len, alloc_unit, &mod);
+ return mod == 0;
}
- return !((pos | len) & mask);
+ return !((pos | len) & (alloc_unit - 1));
}
/*
diff --git a/fs/xfs/xfs_file.h b/fs/xfs/xfs_file.h
new file mode 100644
index 000000000000..2ad91f755caf
--- /dev/null
+++ b/fs/xfs/xfs_file.h
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_FILE_H__
+#define __XFS_FILE_H__
+
+extern const struct file_operations xfs_file_operations;
+extern const struct file_operations xfs_dir_file_operations;
+
+bool xfs_is_falloc_aligned(struct xfs_inode *ip, loff_t pos,
+ long long int len);
+
+#endif /* __XFS_FILE_H__ */
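
The reworked xfs_is_falloc_aligned() above collapses the realtime and
data-device cases into one allocation-unit check: a mask test when the unit is
a power of two, 64-bit remainders otherwise. A standalone sketch, assuming a
hypothetical 12 KiB (3 x 4 KiB) realtime allocation unit:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool is_power_of_2(uint64_t n)
{
	return n && !(n & (n - 1));
}

static bool falloc_aligned(uint64_t pos, uint64_t len, uint64_t alloc_unit)
{
	if (!is_power_of_2(alloc_unit))
		return pos % alloc_unit == 0 && len % alloc_unit == 0;

	/* power-of-two fast path: both values must clear the mask */
	return ((pos | len) & (alloc_unit - 1)) == 0;
}

int main(void)
{
	/* hypothetical 12KiB rt allocation unit (3 x 4096, not 2^n) */
	printf("%d\n", falloc_aligned(24576, 12288, 12288)); /* 1 */
	printf("%d\n", falloc_aligned(4096, 12288, 12288));  /* 0 */
	return 0;
}
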
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 062e5dc5db9f..a0668a1ef100 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -23,7 +23,7 @@
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_alloc_btree.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_ag.h"
/* Convert an xfs_fsmap to an fsmap. */
@@ -71,7 +71,7 @@ xfs_fsmap_owner_to_rmap(
switch (src->fmr_owner) {
case 0: /* "lowest owner id possible" */
case -1ULL: /* "highest owner id possible" */
- dest->rm_owner = 0;
+ dest->rm_owner = src->fmr_owner;
break;
case XFS_FMR_OWN_FREE:
dest->rm_owner = XFS_RMAP_OWN_NULL;
@@ -160,9 +160,18 @@ struct xfs_getfsmap_info {
struct xfs_buf *agf_bp; /* AGF, for refcount queries */
struct xfs_perag *pag; /* AG info, if applicable */
xfs_daddr_t next_daddr; /* next daddr we expect */
+ /* daddr of low fsmap key when we're using the rtbitmap */
+ xfs_daddr_t low_daddr;
u64 missing_owner; /* owner of holes */
u32 dev; /* device id */
- struct xfs_rmap_irec low; /* low rmap key */
+ /*
+ * Low rmap key for the query. If low.rm_blockcount is nonzero, this
+ * is the second (or later) call to retrieve the recordset in pieces.
+ * xfs_getfsmap_rec_before_start will compare all records retrieved
+ * by the rmapbt query to filter out any records that start before
+ * the last record.
+ */
+ struct xfs_rmap_irec low;
struct xfs_rmap_irec high; /* high rmap key */
bool last; /* last extent? */
};
@@ -237,16 +246,31 @@ xfs_getfsmap_format(
xfs_fsmap_from_internal(rec, xfm);
}
+static inline bool
+xfs_getfsmap_rec_before_start(
+ struct xfs_getfsmap_info *info,
+ const struct xfs_rmap_irec *rec,
+ xfs_daddr_t rec_daddr)
+{
+ if (info->low_daddr != XFS_BUF_DADDR_NULL)
+ return rec_daddr < info->low_daddr;
+ if (info->low.rm_blockcount)
+ return xfs_rmap_compare(rec, &info->low) < 0;
+ return false;
+}
+
/*
* Format a reverse mapping for getfsmap, having translated rm_startblock
- * into the appropriate daddr units.
+ * into the appropriate daddr units. Pass in a nonzero @len_daddr if the
+ * length could be larger than rm_blockcount in struct xfs_rmap_irec.
*/
STATIC int
xfs_getfsmap_helper(
struct xfs_trans *tp,
struct xfs_getfsmap_info *info,
const struct xfs_rmap_irec *rec,
- xfs_daddr_t rec_daddr)
+ xfs_daddr_t rec_daddr,
+ xfs_daddr_t len_daddr)
{
struct xfs_fsmap fmr;
struct xfs_mount *mp = tp->t_mountp;
@@ -256,12 +280,15 @@ xfs_getfsmap_helper(
if (fatal_signal_pending(current))
return -EINTR;
+ if (len_daddr == 0)
+ len_daddr = XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+
/*
* Filter out records that start before our startpoint, if the
* caller requested that.
*/
- if (xfs_rmap_compare(rec, &info->low) < 0) {
- rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ if (xfs_getfsmap_rec_before_start(info, rec, rec_daddr)) {
+ rec_daddr += len_daddr;
if (info->next_daddr < rec_daddr)
info->next_daddr = rec_daddr;
return 0;
@@ -280,7 +307,7 @@ xfs_getfsmap_helper(
info->head->fmh_entries++;
- rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ rec_daddr += len_daddr;
if (info->next_daddr < rec_daddr)
info->next_daddr = rec_daddr;
return 0;
@@ -320,7 +347,7 @@ xfs_getfsmap_helper(
if (error)
return error;
fmr.fmr_offset = XFS_FSB_TO_BB(mp, rec->rm_offset);
- fmr.fmr_length = XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ fmr.fmr_length = len_daddr;
if (rec->rm_flags & XFS_RMAP_UNWRITTEN)
fmr.fmr_flags |= FMR_OF_PREALLOC;
if (rec->rm_flags & XFS_RMAP_ATTR_FORK)
@@ -337,7 +364,7 @@ xfs_getfsmap_helper(
xfs_getfsmap_format(mp, &fmr, info);
out:
- rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ rec_daddr += len_daddr;
if (info->next_daddr < rec_daddr)
info->next_daddr = rec_daddr;
return 0;
@@ -358,7 +385,7 @@ xfs_getfsmap_datadev_helper(
fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, rec->rm_startblock);
rec_daddr = XFS_FSB_TO_DADDR(mp, fsb);
- return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr);
+ return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr, 0);
}
/* Transform a bnobt irec into a fsmap */
@@ -382,7 +409,7 @@ xfs_getfsmap_datadev_bnobt_helper(
irec.rm_offset = 0;
irec.rm_flags = 0;
- return xfs_getfsmap_helper(cur->bc_tp, info, &irec, rec_daddr);
+ return xfs_getfsmap_helper(cur->bc_tp, info, &irec, rec_daddr, 0);
}
/* Set rmap flags based on the getfsmap flags */
@@ -409,31 +436,25 @@ xfs_getfsmap_logdev(
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_rmap_irec rmap;
- int error;
+ xfs_daddr_t rec_daddr, len_daddr;
+ xfs_fsblock_t start_fsb, end_fsb;
+ uint64_t eofs;
- /* Set up search keys */
- info->low.rm_startblock = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical);
- info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
- error = xfs_fsmap_owner_to_rmap(&info->low, keys);
- if (error)
- return error;
- info->low.rm_blockcount = 0;
- xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+ eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
+ if (keys[0].fmr_physical >= eofs)
+ return 0;
+ start_fsb = XFS_BB_TO_FSBT(mp,
+ keys[0].fmr_physical + keys[0].fmr_length);
+ end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
- error = xfs_fsmap_owner_to_rmap(&info->high, keys + 1);
- if (error)
- return error;
- info->high.rm_startblock = -1U;
- info->high.rm_owner = ULLONG_MAX;
- info->high.rm_offset = ULLONG_MAX;
- info->high.rm_blockcount = 0;
- info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS;
- info->missing_owner = XFS_FMR_OWN_FREE;
+ /* Adjust the low key if we are continuing from where we left off. */
+ if (keys[0].fmr_length > 0)
+ info->low_daddr = XFS_FSB_TO_BB(mp, start_fsb);
- trace_xfs_fsmap_low_key(mp, info->dev, NULLAGNUMBER, &info->low);
- trace_xfs_fsmap_high_key(mp, info->dev, NULLAGNUMBER, &info->high);
+ trace_xfs_fsmap_low_key_linear(mp, info->dev, start_fsb);
+ trace_xfs_fsmap_high_key_linear(mp, info->dev, end_fsb);
- if (keys[0].fmr_physical > 0)
+ if (start_fsb > 0)
return 0;
/* Fabricate an rmap entry for the external log device. */
@@ -443,7 +464,9 @@ xfs_getfsmap_logdev(
rmap.rm_offset = 0;
rmap.rm_flags = 0;
- return xfs_getfsmap_helper(tp, info, &rmap, 0);
+ rec_daddr = XFS_FSB_TO_BB(mp, rmap.rm_startblock);
+ len_daddr = XFS_FSB_TO_BB(mp, rmap.rm_blockcount);
+ return xfs_getfsmap_helper(tp, info, &rmap, rec_daddr, len_daddr);
}
#ifdef CONFIG_XFS_RT
@@ -457,72 +480,58 @@ xfs_getfsmap_rtdev_rtbitmap_helper(
{
struct xfs_getfsmap_info *info = priv;
struct xfs_rmap_irec irec;
- xfs_daddr_t rec_daddr;
+ xfs_rtblock_t rtbno;
+ xfs_daddr_t rec_daddr, len_daddr;
+
+ rtbno = rec->ar_startext * mp->m_sb.sb_rextsize;
+ rec_daddr = XFS_FSB_TO_BB(mp, rtbno);
+ irec.rm_startblock = rtbno;
+
+ rtbno = rec->ar_extcount * mp->m_sb.sb_rextsize;
+ len_daddr = XFS_FSB_TO_BB(mp, rtbno);
+ irec.rm_blockcount = rtbno;
- irec.rm_startblock = rec->ar_startext * mp->m_sb.sb_rextsize;
- rec_daddr = XFS_FSB_TO_BB(mp, irec.rm_startblock);
- irec.rm_blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize;
irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */
irec.rm_offset = 0;
irec.rm_flags = 0;
- return xfs_getfsmap_helper(tp, info, &irec, rec_daddr);
+ return xfs_getfsmap_helper(tp, info, &irec, rec_daddr, len_daddr);
}
-/* Execute a getfsmap query against the realtime device. */
+/* Execute a getfsmap query against the realtime device rtbitmap. */
STATIC int
-__xfs_getfsmap_rtdev(
+xfs_getfsmap_rtdev_rtbitmap(
struct xfs_trans *tp,
const struct xfs_fsmap *keys,
- int (*query_fn)(struct xfs_trans *,
- struct xfs_getfsmap_info *),
struct xfs_getfsmap_info *info)
{
+
+ struct xfs_rtalloc_rec alow = { 0 };
+ struct xfs_rtalloc_rec ahigh = { 0 };
struct xfs_mount *mp = tp->t_mountp;
- xfs_fsblock_t start_fsb;
- xfs_fsblock_t end_fsb;
+ xfs_rtblock_t start_rtb;
+ xfs_rtblock_t end_rtb;
uint64_t eofs;
- int error = 0;
+ int error;
- eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
+ eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rextents * mp->m_sb.sb_rextsize);
if (keys[0].fmr_physical >= eofs)
return 0;
- start_fsb = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical);
- end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
+ start_rtb = XFS_BB_TO_FSBT(mp,
+ keys[0].fmr_physical + keys[0].fmr_length);
+ end_rtb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
- /* Set up search keys */
- info->low.rm_startblock = start_fsb;
- error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
- if (error)
- return error;
- info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
- info->low.rm_blockcount = 0;
- xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
-
- info->high.rm_startblock = end_fsb;
- error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]);
- if (error)
- return error;
- info->high.rm_offset = XFS_BB_TO_FSBT(mp, keys[1].fmr_offset);
- info->high.rm_blockcount = 0;
- xfs_getfsmap_set_irec_flags(&info->high, &keys[1]);
+ info->missing_owner = XFS_FMR_OWN_UNKNOWN;
- trace_xfs_fsmap_low_key(mp, info->dev, NULLAGNUMBER, &info->low);
- trace_xfs_fsmap_high_key(mp, info->dev, NULLAGNUMBER, &info->high);
+ /* Adjust the low key if we are continuing from where we left off. */
+ if (keys[0].fmr_length > 0) {
+ info->low_daddr = XFS_FSB_TO_BB(mp, start_rtb);
+ if (info->low_daddr >= eofs)
+ return 0;
+ }
- return query_fn(tp, info);
-}
-
-/* Actually query the realtime bitmap. */
-STATIC int
-xfs_getfsmap_rtdev_rtbitmap_query(
- struct xfs_trans *tp,
- struct xfs_getfsmap_info *info)
-{
- struct xfs_rtalloc_rec alow = { 0 };
- struct xfs_rtalloc_rec ahigh = { 0 };
- struct xfs_mount *mp = tp->t_mountp;
- int error;
+ trace_xfs_fsmap_low_key_linear(mp, info->dev, start_rtb);
+ trace_xfs_fsmap_high_key_linear(mp, info->dev, end_rtb);
xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED);
@@ -530,8 +539,8 @@ xfs_getfsmap_rtdev_rtbitmap_query(
* Set up query parameters to return free rtextents covering the range
* we want.
*/
- alow.ar_startext = info->low.rm_startblock;
- ahigh.ar_startext = info->high.rm_startblock;
+ alow.ar_startext = start_rtb;
+ ahigh.ar_startext = end_rtb;
do_div(alow.ar_startext, mp->m_sb.sb_rextsize);
if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize))
ahigh.ar_startext++;
@@ -554,19 +563,20 @@ err:
xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED);
return error;
}
+#endif /* CONFIG_XFS_RT */
-/* Execute a getfsmap query against the realtime device rtbitmap. */
-STATIC int
-xfs_getfsmap_rtdev_rtbitmap(
- struct xfs_trans *tp,
- const struct xfs_fsmap *keys,
- struct xfs_getfsmap_info *info)
+static inline bool
+rmap_not_shareable(struct xfs_mount *mp, const struct xfs_rmap_irec *r)
{
- info->missing_owner = XFS_FMR_OWN_UNKNOWN;
- return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query,
- info);
+ if (!xfs_has_reflink(mp))
+ return true;
+ if (XFS_RMAP_NON_INODE_OWNER(r->rm_owner))
+ return true;
+ if (r->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
+ XFS_RMAP_UNWRITTEN))
+ return true;
+ return false;
}
-#endif /* CONFIG_XFS_RT */
/* Execute a getfsmap query against the regular data device. */
STATIC int
@@ -601,14 +611,30 @@ __xfs_getfsmap_datadev(
* low to the fsmap low key and max out the high key to the end
* of the AG.
*/
- info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb);
info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
if (error)
return error;
- info->low.rm_blockcount = 0;
+ info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length);
xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+ /* Adjust the low key if we are continuing from where we left off. */
+ if (info->low.rm_blockcount == 0) {
+ /* No previous record from which to continue */
+ } else if (rmap_not_shareable(mp, &info->low)) {
+ /* Last record seen was an unshareable extent */
+ info->low.rm_owner = 0;
+ info->low.rm_offset = 0;
+
+ start_fsb += info->low.rm_blockcount;
+ if (XFS_FSB_TO_DADDR(mp, start_fsb) >= eofs)
+ return 0;
+ } else {
+ /* Last record seen was a shareable file data extent */
+ info->low.rm_offset += info->low.rm_blockcount;
+ }
+ info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb);
+
info->high.rm_startblock = -1U;
info->high.rm_owner = ULLONG_MAX;
info->high.rm_offset = ULLONG_MAX;
@@ -659,12 +685,8 @@ __xfs_getfsmap_datadev(
* Set the AG low key to the start of the AG prior to
* moving on to the next AG.
*/
- if (pag->pag_agno == start_ag) {
- info->low.rm_startblock = 0;
- info->low.rm_owner = 0;
- info->low.rm_offset = 0;
- info->low.rm_flags = 0;
- }
+ if (pag->pag_agno == start_ag)
+ memset(&info->low, 0, sizeof(info->low));
/*
* If this is the last AG, report any gap at the end of it
@@ -791,6 +813,19 @@ xfs_getfsmap_check_keys(
struct xfs_fsmap *low_key,
struct xfs_fsmap *high_key)
{
+ if (low_key->fmr_flags & (FMR_OF_SPECIAL_OWNER | FMR_OF_EXTENT_MAP)) {
+ if (low_key->fmr_offset)
+ return false;
+ }
+ if (high_key->fmr_flags != -1U &&
+ (high_key->fmr_flags & (FMR_OF_SPECIAL_OWNER |
+ FMR_OF_EXTENT_MAP))) {
+ if (high_key->fmr_offset && high_key->fmr_offset != -1ULL)
+ return false;
+ }
+ if (high_key->fmr_length && high_key->fmr_length != -1ULL)
+ return false;
+
if (low_key->fmr_device > high_key->fmr_device)
return false;
if (low_key->fmr_device < high_key->fmr_device)
@@ -834,15 +869,15 @@ xfs_getfsmap_check_keys(
* ----------------
* There are multiple levels of keys and counters at work here:
* xfs_fsmap_head.fmh_keys -- low and high fsmap keys passed in;
- * these reflect fs-wide sector addrs.
+ * these reflect fs-wide sector addrs.
* dkeys -- fmh_keys used to query each device;
- * these are fmh_keys but w/ the low key
- * bumped up by fmr_length.
+ * these are fmh_keys but w/ the low key
+ * bumped up by fmr_length.
* xfs_getfsmap_info.next_daddr -- next disk addr we expect to see; this
* is how we detect gaps in the fsmap
records and report them.
* xfs_getfsmap_info.low/high -- per-AG low/high keys computed from
- * dkeys; used to query the metadata.
+ * dkeys; used to query the metadata.
*/
int
xfs_getfsmap(
@@ -863,6 +898,8 @@ xfs_getfsmap(
if (!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[0]) ||
!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1]))
return -EINVAL;
+ if (!xfs_getfsmap_check_keys(&head->fmh_keys[0], &head->fmh_keys[1]))
+ return -EINVAL;
use_rmap = xfs_has_rmapbt(mp) &&
has_capability_noaudit(current, CAP_SYS_ADMIN);
@@ -901,26 +938,15 @@ xfs_getfsmap(
* blocks could be mapped to several other files/offsets.
* According to rmapbt record ordering, the minimal next
* possible record for the block range is the next starting
- * offset in the same inode. Therefore, bump the file offset to
- * continue the search appropriately. For all other low key
- * mapping types (attr blocks, metadata), bump the physical
- * offset as there can be no other mapping for the same physical
- * block range.
+ * offset in the same inode. Therefore, each fsmap backend bumps
+ * the file offset to continue the search appropriately. For
+ * all other low key mapping types (attr blocks, metadata), each
+ * fsmap backend bumps the physical offset as there can be no
+ * other mapping for the same physical block range.
*/
dkeys[0] = head->fmh_keys[0];
- if (dkeys[0].fmr_flags & (FMR_OF_SPECIAL_OWNER | FMR_OF_EXTENT_MAP)) {
- dkeys[0].fmr_physical += dkeys[0].fmr_length;
- dkeys[0].fmr_owner = 0;
- if (dkeys[0].fmr_offset)
- return -EINVAL;
- } else
- dkeys[0].fmr_offset += dkeys[0].fmr_length;
- dkeys[0].fmr_length = 0;
memset(&dkeys[1], 0xFF, sizeof(struct xfs_fsmap));
- if (!xfs_getfsmap_check_keys(dkeys, &head->fmh_keys[1]))
- return -EINVAL;
-
info.next_daddr = head->fmh_keys[0].fmr_physical +
head->fmh_keys[0].fmr_length;
info.fsmap_recs = fsmap_recs;
@@ -960,6 +986,8 @@ xfs_getfsmap(
info.dev = handlers[i].dev;
info.last = false;
info.pag = NULL;
+ info.low_daddr = XFS_BUF_DADDR_NULL;
+ info.low.rm_blockcount = 0;
error = handlers[i].fn(tp, dkeys, &info);
if (error)
break;
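
The low_daddr/low.rm_blockcount plumbing above lets a resumed GETFSMAP query
skip records that start before the previous call's last record while still
advancing next_daddr past them, so gap detection keeps working. A toy sketch of
that filter; the daddr values and struct layout are invented:

#include <stdint.h>
#include <stdio.h>

#define DADDR_NULL UINT64_MAX

struct fsmap_info {
	uint64_t low_daddr;	/* resume point; DADDR_NULL if none */
	uint64_t next_daddr;	/* next address we expect to see */
};

static int rec_before_start(const struct fsmap_info *info, uint64_t rec_daddr)
{
	if (info->low_daddr != DADDR_NULL)
		return rec_daddr < info->low_daddr;
	return 0;
}

static void emit_or_skip(struct fsmap_info *info, uint64_t rec_daddr,
			 uint64_t len_daddr)
{
	if (rec_before_start(info, rec_daddr))
		printf("skip   [%llu, %llu)\n",
		       (unsigned long long)rec_daddr,
		       (unsigned long long)(rec_daddr + len_daddr));
	else
		printf("report [%llu, %llu)\n",
		       (unsigned long long)rec_daddr,
		       (unsigned long long)(rec_daddr + len_daddr));

	/* either way, advance the cursor past this record */
	if (info->next_daddr < rec_daddr + len_daddr)
		info->next_daddr = rec_daddr + len_daddr;
}

int main(void)
{
	struct fsmap_info info = { .low_daddr = 100, .next_daddr = 0 };

	emit_or_skip(&info, 40, 32);	/* starts before 100: skipped */
	emit_or_skip(&info, 128, 64);	/* reported */
	printf("next_daddr = %llu\n", (unsigned long long)info.next_daddr);
	return 0;
}
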
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 77b14f788214..96e9d64fbe62 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -153,7 +153,7 @@ xfs_growfs_data_private(
(delta > 0 ? XFS_GROWFS_SPACE_RES(mp) : -delta), 0,
XFS_TRANS_RESERVE, &tp);
if (error)
- return error;
+ goto out_free_unused_perag;
last_pag = xfs_perag_get(mp, oagcount - 1);
if (delta > 0) {
@@ -227,6 +227,9 @@ xfs_growfs_data_private(
out_trans_cancel:
xfs_trans_cancel(tp);
+out_free_unused_perag:
+ if (nagcount > oagcount)
+ xfs_free_unused_perag_range(mp, oagcount, nagcount);
return error;
}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 6df826fc787c..586d26c05160 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1186,7 +1186,7 @@ xfs_inode_free_eofblocks(
}
*lockflags |= XFS_IOLOCK_EXCL;
- if (xfs_can_free_eofblocks(ip, false))
+ if (xfs_can_free_eofblocks(ip))
return xfs_free_eofblocks(ip);
/* inode could be preallocated or append-only */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 26961b0dae03..3ccbc31767b3 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -983,10 +983,12 @@ xfs_create(
prid = xfs_get_initial_prid(dp);
/*
- * Make sure that we have allocated dquot(s) on disk.
+ * Make sure that we have allocated dquot(s) on disk. The uid/gid
+ * computation code must match what the VFS uses to assign i_[ug]id.
+ * INHERIT adjusts the gid computation for setgid/grpid systems.
*/
- error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns),
- mapped_fsgid(mnt_userns, &init_user_ns), prid,
+ error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, i_user_ns(VFS_I(dp))),
+ mapped_fsgid(mnt_userns, i_user_ns(VFS_I(dp))), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
@@ -1132,10 +1134,12 @@ xfs_create_tmpfile(
prid = xfs_get_initial_prid(dp);
/*
- * Make sure that we have allocated dquot(s) on disk.
+ * Make sure that we have allocated dquot(s) on disk. The uid/gid
+ * computation code must match what the VFS uses to assign i_[ug]id.
+ * INHERIT adjusts the gid computation for setgid/grpid systems.
*/
- error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns),
- mapped_fsgid(mnt_userns, &init_user_ns), prid,
+ error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, i_user_ns(VFS_I(dp))),
+ mapped_fsgid(mnt_userns, i_user_ns(VFS_I(dp))), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
@@ -1459,7 +1463,7 @@ xfs_release(
if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
return 0;
- if (xfs_can_free_eofblocks(ip, false)) {
+ if (xfs_can_free_eofblocks(ip)) {
/*
* Check if the inode is being opened, written and closed
* frequently and we have delayed allocation blocks outstanding
@@ -1675,15 +1679,13 @@ xfs_inode_needs_inactive(
/*
* This file isn't being freed, so check if there are post-eof blocks
- * to free. @force is true because we are evicting an inode from the
- * cache. Post-eof blocks must be freed, lest we end up with broken
- * free space accounting.
+ * to free.
*
* Note: don't bother with iolock here since lockdep complains about
* acquiring it in reclaim context. We have the only reference to the
* inode at this point anyways.
*/
- return xfs_can_free_eofblocks(ip, true);
+ return xfs_can_free_eofblocks(ip);
}
/*
@@ -1734,15 +1736,11 @@ xfs_inactive(
if (VFS_I(ip)->i_nlink != 0) {
/*
- * force is true because we are evicting an inode from the
- * cache. Post-eof blocks must be freed, lest we end up with
- * broken free space accounting.
- *
* Note: don't bother with iolock here since lockdep complains
* about acquiring it in reclaim context. We have the only
* reference to the inode at this point anyways.
*/
- if (xfs_can_free_eofblocks(ip, true))
+ if (xfs_can_free_eofblocks(ip))
error = xfs_free_eofblocks(ip);
goto out;
@@ -3777,3 +3775,16 @@ xfs_inode_reload_unlinked(
return error;
}
+
+/* Returns the size of the fundamental allocation unit for a file, in bytes. */
+unsigned int
+xfs_inode_alloc_unitsize(
+ struct xfs_inode *ip)
+{
+ unsigned int blocks = 1;
+
+ if (XFS_IS_REALTIME_INODE(ip))
+ blocks = ip->i_mount->m_sb.sb_rextsize;
+
+ return XFS_FSB_TO_B(ip->i_mount, blocks);
+}
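xfs_inode_alloc_unitsize() reports the file's allocation unit in bytes: one filesystem block for ordinary files, sb_rextsize blocks for realtime files. A hypothetical caller would use the value to round byte ranges out to allocation-unit boundaries; a sketch under that assumption, with the unit passed in rather than derived from an inode:

#include <stdint.h>
#include <stdio.h>

/* Round a byte range [start, start + len) out to unit boundaries. */
static void align_range(uint64_t start, uint64_t len, unsigned unit,
			uint64_t *astart, uint64_t *alen)
{
	uint64_t end = start + len;

	*astart = start - (start % unit);			/* round down */
	*alen = ((end + unit - 1) / unit) * unit - *astart;	/* round up */
}

int main(void)
{
	uint64_t astart, alen;

	/* 4096-byte blocks with rt extent size 16 => 64 KiB unit */
	align_range(100000, 5000, 16 * 4096, &astart, &alen);
	printf("start=%llu len=%llu\n", (unsigned long long)astart,
	       (unsigned long long)alen);	/* start=65536 len=65536 */
	return 0;
}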
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index c177c92f3aa5..c4f426eadf8e 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -622,4 +622,6 @@ xfs_inode_unlinked_incomplete(
int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip);
int xfs_inode_reload_unlinked(struct xfs_inode *ip);
+unsigned int xfs_inode_alloc_unitsize(struct xfs_inode *ip);
+
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 91c847a84e10..a734ca8d8f03 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -36,6 +36,36 @@ xfs_inode_item_sort(
return INODE_ITEM(lip)->ili_inode->i_ino;
}
+#ifdef DEBUG_EXPENSIVE
+static void
+xfs_inode_item_precommit_check(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_dinode *dip;
+ xfs_failaddr_t fa;
+
+ dip = kzalloc(mp->m_sb.sb_inodesize, GFP_NOFS);
+ if (!dip) {
+ ASSERT(dip != NULL);
+ return;
+ }
+
+ xfs_inode_to_disk(ip, dip, 0);
+ xfs_dinode_calc_crc(mp, dip);
+ fa = xfs_dinode_verify(mp, ip->i_ino, dip);
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+ sizeof(*dip), fa);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ ASSERT(fa == NULL);
+ }
+ kfree(dip);
+}
+#else
+# define xfs_inode_item_precommit_check(ip) ((void)0)
+#endif
+
/*
* Prior to finally logging the inode, we have to ensure that all the
* per-modification inode state changes are applied. This includes VFS inode
@@ -168,6 +198,8 @@ xfs_inode_item_precommit(
iip->ili_fields |= (flags | iip->ili_last_fields);
spin_unlock(&iip->ili_lock);
+ xfs_inode_item_precommit_check(ip);
+
/*
* We are done with the log item transaction dirty state, so clear it so
* that it doesn't pollute future transactions.
@@ -556,6 +588,9 @@ xfs_inode_to_log_dinode(
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
to->di_v3_pad = 0;
+
+ /* dummy value for initialisation */
+ to->di_crc = 0;
} else {
to->di_version = 2;
to->di_flushiter = ip->i_flushiter;
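The DEBUG_EXPENSIVE hook above encodes the in-core inode to its on-disk format and runs the verifier before the log item is committed, so corruption is caught where it is introduced rather than much later at writeback. A user-space sketch of that encode-then-verify pattern, with hypothetical encode/verify stand-ins for xfs_inode_to_disk() and xfs_dinode_verify():

#include <stdio.h>
#include <stdlib.h>

struct incore { int nlink; unsigned mode; };
struct ondisk { int nlink; unsigned mode; };

static void encode(const struct incore *ic, struct ondisk *od)
{
	od->nlink = ic->nlink;
	od->mode = ic->mode;
}

/* NULL means the object verifies, like a NULL xfs_failaddr_t. */
static const char *verify(const struct ondisk *od)
{
	if (od->nlink < 0)
		return "negative link count";
	return NULL;
}

/* Check the object exactly as it would be written, before commit. */
static void precommit_check(const struct incore *ic)
{
	struct ondisk *od = calloc(1, sizeof(*od));
	const char *fa;

	if (!od)
		return;		/* best-effort debug check */
	encode(ic, od);
	fa = verify(od);
	if (fa) {
		fprintf(stderr, "precommit verify failed: %s\n", fa);
		abort();	/* models shutdown + ASSERT */
	}
	free(od);
}

int main(void)
{
	struct incore ic = { .nlink = 1, .mode = 0100644 };

	precommit_check(&ic);
	return 0;
}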
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index c7cb496dc345..ef3dc0778566 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -38,6 +38,7 @@
#include "xfs_reflink.h"
#include "xfs_ioctl.h"
#include "xfs_xattr.h"
+#include "xfs_file.h"
#include <linux/mount.h>
#include <linux/namei.h>
@@ -1127,6 +1128,17 @@ xfs_ioctl_setattr_xflags(
/* Can't change realtime flag if any extents are allocated. */
if (ip->i_df.if_nextents || ip->i_delayed_blks)
return -EINVAL;
+
+ /*
+ * If S_DAX is enabled on this file, we can only switch the
+ * device if both support fsdax. We can't update S_DAX because
+ * there might be other threads walking down the access paths.
+ */
+ if (IS_DAX(VFS_I(ip)) &&
+ (mp->m_ddev_targp->bt_daxdev == NULL ||
+ (mp->m_rtdev_targp &&
+ mp->m_rtdev_targp->bt_daxdev == NULL)))
+ return -EINVAL;
}
if (rtflag) {
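The added check refuses to flip the realtime flag on an S_DAX file unless every device the file could land on is DAX-capable, since S_DAX itself cannot be retoggled while other threads may be walking the access paths. A hedged sketch of just that decision, with a hypothetical target type in place of the buftarg:

#include <stdbool.h>
#include <stddef.h>

/* Hypothetical device target: daxdev is non-NULL when DAX-capable. */
struct target { void *daxdev; };

static bool can_switch_rt_flag(bool is_dax, const struct target *ddev,
			       const struct target *rtdev)
{
	if (!is_dax)
		return true;		/* nothing depends on DAX */
	if (ddev->daxdev == NULL)
		return false;
	if (rtdev && rtdev->daxdev == NULL)
		return false;
	return true;
}

int main(void)
{
	struct target ddev = { .daxdev = (void *)1 }, rtdev = { NULL };

	/* refused: the rt device lacks fsdax support */
	return can_switch_rt_flag(true, &ddev, &rtdev) ? 1 : 0;
}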
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index ab5512c0bcf7..28a1c19dfdb3 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -309,14 +309,6 @@ xfs_iomap_write_direct(
if (error)
goto out_unlock;
- /*
- * Copy any maps to caller's array and return any error.
- */
- if (nimaps == 0) {
- error = -ENOSPC;
- goto out_unlock;
- }
-
if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
error = xfs_alert_fsblock_zero(ip, imap);
@@ -1005,6 +997,24 @@ xfs_buffered_write_iomap_begin(
}
/*
+ * For zeroing, trim a delalloc extent that extends beyond the EOF
+ * block. If it starts beyond the EOF block, convert it to an
+ * unwritten extent.
+ */
+ if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb &&
+ isnullstartblock(imap.br_startblock)) {
+ xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
+
+ if (offset_fsb >= eof_fsb)
+ goto convert_delay;
+ if (end_fsb > eof_fsb) {
+ end_fsb = eof_fsb;
+ xfs_trim_extent(&imap, offset_fsb,
+ end_fsb - offset_fsb);
+ }
+ }
+
+ /*
* Search the COW fork extent list even if we did not find a data fork
* extent. This serves two purposes: first this implements the
* speculative preallocation using cowextsize, so that we also unshare
@@ -1105,47 +1115,48 @@ xfs_buffered_write_iomap_begin(
}
}
-retry:
- error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
- end_fsb - offset_fsb, prealloc_blocks,
- allocfork == XFS_DATA_FORK ? &imap : &cmap,
- allocfork == XFS_DATA_FORK ? &icur : &ccur,
- allocfork == XFS_DATA_FORK ? eof : cow_eof);
- switch (error) {
- case 0:
- break;
- case -ENOSPC:
- case -EDQUOT:
- /* retry without any preallocation */
- trace_xfs_delalloc_enospc(ip, offset, count);
- if (prealloc_blocks) {
- prealloc_blocks = 0;
- goto retry;
- }
- fallthrough;
- default:
- goto out_unlock;
- }
-
if (allocfork == XFS_COW_FORK) {
+ error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
+ end_fsb - offset_fsb, prealloc_blocks, &cmap,
+ &ccur, cow_eof);
+ if (error)
+ goto out_unlock;
+
trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap);
goto found_cow;
}
+ error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
+ end_fsb - offset_fsb, prealloc_blocks, &imap, &icur,
+ eof);
+ if (error)
+ goto out_unlock;
+
/*
* Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
* them out if the write happens to fail.
*/
seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq);
found_imap:
seq = xfs_iomap_inode_sequence(ip, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
+convert_delay:
+ xfs_iunlock(ip, lockmode);
+ truncate_pagecache(inode, offset);
+ error = xfs_bmapi_convert_delalloc(ip, XFS_DATA_FORK, offset,
+ iomap, NULL);
+ if (error)
+ return error;
+
+ trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &imap);
+ return 0;
+
found_cow:
seq = xfs_iomap_inode_sequence(ip, 0);
if (imap.br_startoff <= offset_fsb) {
@@ -1153,17 +1164,17 @@ found_cow:
if (error)
goto out_unlock;
seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
IOMAP_F_SHARED, seq);
}
xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq);
out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return error;
}
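The new IOMAP_ZERO branch above keeps zeroing from writing back post-EOF delalloc: a delalloc extent crossing the EOF block is trimmed to end there, and one starting at or beyond it is converted to unwritten via convert_delay. A simplified sketch of the trim arithmetic in filesystem-block units (the real code also trims the head to offset_fsb via xfs_trim_extent; omitted here):

#include <stdio.h>
#include <stdint.h>

struct ext { uint64_t start, count; };	/* startoff/blockcount, in fsb */

/* Returns 1 when the caller should convert in place (past EOF),
 * 0 after trimming the extent so it ends at the EOF block. */
static int zero_trim(struct ext *e, uint64_t offset_fsb, uint64_t eof_fsb)
{
	if (offset_fsb >= eof_fsb)
		return 1;			/* goto convert_delay */
	if (e->start + e->count > eof_fsb)
		e->count = eof_fsb - e->start;	/* trim the tail */
	return 0;
}

int main(void)
{
	struct ext e = { .start = 10, .count = 20 };	/* [10, 30) */

	zero_trim(&e, 12, 16);				/* EOF in block 16 */
	printf("[%llu, %llu)\n", (unsigned long long)e.start,
	       (unsigned long long)(e.start + e.count));	/* [10, 16) */
	return 0;
}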
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 6fbdc0a19e54..9ca1b8bf1f05 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -25,6 +25,7 @@
#include "xfs_error.h"
#include "xfs_ioctl.h"
#include "xfs_xattr.h"
+#include "xfs_file.h"
#include <linux/posix_acl.h>
#include <linux/security.h>
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index e570dcb5df8d..73ff92355eaa 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -8,9 +8,6 @@
struct xfs_inode;
-extern const struct file_operations xfs_file_operations;
-extern const struct file_operations xfs_dir_file_operations;
-
extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
int xfs_vn_setattr_size(struct user_namespace *mnt_userns,
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 858e3e9eb4a8..dfd7b824e32b 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -252,17 +252,12 @@ static int
xfs_trans_log_finish_refcount_update(
struct xfs_trans *tp,
struct xfs_cud_log_item *cudp,
- enum xfs_refcount_intent_type type,
- xfs_fsblock_t startblock,
- xfs_extlen_t blockcount,
- xfs_fsblock_t *new_fsb,
- xfs_extlen_t *new_len,
+ struct xfs_refcount_intent *ri,
struct xfs_btree_cur **pcur)
{
int error;
- error = xfs_refcount_finish_one(tp, type, startblock,
- blockcount, new_fsb, new_len, pcur);
+ error = xfs_refcount_finish_one(tp, ri, pcur);
/*
* Mark the transaction dirty, even on error. This ensures the
@@ -378,25 +373,20 @@ xfs_refcount_update_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
- struct xfs_refcount_intent *refc;
- xfs_fsblock_t new_fsb;
- xfs_extlen_t new_aglen;
+ struct xfs_refcount_intent *ri;
int error;
- refc = container_of(item, struct xfs_refcount_intent, ri_list);
- error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done),
- refc->ri_type, refc->ri_startblock, refc->ri_blockcount,
- &new_fsb, &new_aglen, state);
+ ri = container_of(item, struct xfs_refcount_intent, ri_list);
+ error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri,
+ state);
/* Did we run out of reservation? Requeue what we didn't finish. */
- if (!error && new_aglen > 0) {
- ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE ||
- refc->ri_type == XFS_REFCOUNT_DECREASE);
- refc->ri_startblock = new_fsb;
- refc->ri_blockcount = new_aglen;
+ if (!error && ri->ri_blockcount > 0) {
+ ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE ||
+ ri->ri_type == XFS_REFCOUNT_DECREASE);
return -EAGAIN;
}
- kmem_cache_free(xfs_refcount_intent_cache, refc);
+ kmem_cache_free(xfs_refcount_intent_cache, ri);
return error;
}
@@ -463,18 +453,14 @@ xfs_cui_item_recover(
struct xfs_log_item *lip,
struct list_head *capture_list)
{
- struct xfs_bmbt_irec irec;
+ struct xfs_trans_res resv;
struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
- struct xfs_phys_extent *refc;
struct xfs_cud_log_item *cudp;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
struct xfs_mount *mp = lip->li_log->l_mp;
- xfs_fsblock_t new_fsb;
- xfs_extlen_t new_len;
unsigned int refc_type;
bool requeue_only = false;
- enum xfs_refcount_intent_type type;
int i;
int error = 0;
@@ -505,14 +491,18 @@ xfs_cui_item_recover(
* doesn't fit. We need to reserve enough blocks to handle a
* full btree split on either end of the refcount range.
*/
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
- mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp);
+ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
+ error = xfs_trans_alloc(mp, &resv, mp->m_refc_maxlevels * 2, 0,
+ XFS_TRANS_RESERVE, &tp);
if (error)
return error;
cudp = xfs_trans_get_cud(tp, cuip);
for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
+ struct xfs_refcount_intent fake = { };
+ struct xfs_phys_extent *refc;
+
refc = &cuip->cui_format.cui_extents[i];
refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
switch (refc_type) {
@@ -520,7 +510,7 @@ xfs_cui_item_recover(
case XFS_REFCOUNT_DECREASE:
case XFS_REFCOUNT_ALLOC_COW:
case XFS_REFCOUNT_FREE_COW:
- type = refc_type;
+ fake.ri_type = refc_type;
break;
default:
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
@@ -529,13 +519,12 @@ xfs_cui_item_recover(
error = -EFSCORRUPTED;
goto abort_error;
}
- if (requeue_only) {
- new_fsb = refc->pe_startblock;
- new_len = refc->pe_len;
- } else
+
+ fake.ri_startblock = refc->pe_startblock;
+ fake.ri_blockcount = refc->pe_len;
+ if (!requeue_only)
error = xfs_trans_log_finish_refcount_update(tp, cudp,
- type, refc->pe_startblock, refc->pe_len,
- &new_fsb, &new_len, &rcur);
+ &fake, &rcur);
if (error == -EFSCORRUPTED)
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
&cuip->cui_format,
@@ -544,10 +533,13 @@ xfs_cui_item_recover(
goto abort_error;
/* Requeue what we didn't finish. */
- if (new_len > 0) {
- irec.br_startblock = new_fsb;
- irec.br_blockcount = new_len;
- switch (type) {
+ if (fake.ri_blockcount > 0) {
+ struct xfs_bmbt_irec irec = {
+ .br_startblock = fake.ri_startblock,
+ .br_blockcount = fake.ri_blockcount,
+ };
+
+ switch (fake.ri_type) {
case XFS_REFCOUNT_INCREASE:
xfs_refcount_increase_extent(tp, &irec);
break;
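After this refactor, xfs_refcount_finish_one() advances ri_startblock/ri_blockcount inside the intent itself, so a nonzero ri_blockcount on return means the step ran out of reservation and the very same intent is requeued by returning -EAGAIN. A sketch of that partial-progress protocol with a hypothetical work item:

#include <stdio.h>
#include <errno.h>

struct intent { unsigned long start, count; };

/* Process at most `budget` units, advancing the intent in place. */
static int finish_one(struct intent *ri, unsigned long budget)
{
	unsigned long done = ri->count < budget ? ri->count : budget;

	ri->start += done;
	ri->count -= done;
	return 0;
}

static int finish_item(struct intent *ri)
{
	int error = finish_one(ri, 8);

	/* Did we run out of reservation? Requeue what we didn't finish. */
	if (!error && ri->count > 0)
		return -EAGAIN;
	return error;
}

int main(void)
{
	struct intent ri = { .start = 100, .count = 20 };

	while (finish_item(&ri) == -EAGAIN)
		printf("requeue at %lu, %lu left\n", ri.start, ri.count);
	return 0;
}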
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index cbdc23217a42..d539487eaf1a 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -431,13 +431,6 @@ xfs_reflink_fill_cow_hole(
if (error)
return error;
- /*
- * Allocation succeeded but the requested range was not even partially
- * satisfied? Bail out!
- */
- if (nimaps == 0)
- return -ENOSPC;
-
convert:
return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
@@ -500,13 +493,6 @@ xfs_reflink_fill_delalloc(
error = xfs_trans_commit(tp);
if (error)
return error;
-
- /*
- * Allocation succeeded but the requested range was not even
- * partially satisfied? Bail out!
- */
- if (nimaps == 0)
- return -ENOSPC;
} while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);
return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
@@ -618,8 +604,11 @@ xfs_reflink_cancel_cow_blocks(
xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
del.br_blockcount);
- xfs_free_extent_later(*tpp, del.br_startblock,
- del.br_blockcount, NULL);
+ error = xfs_free_extent_later(*tpp, del.br_startblock,
+ del.br_blockcount, NULL,
+ XFS_AG_RESV_NONE);
+ if (error)
+ break;
/* Roll the transaction */
error = xfs_defer_finish(tpp);
@@ -729,12 +718,6 @@ xfs_reflink_end_cow_extent(
int nmaps;
int error;
- /* No COW extents? That's easy! */
- if (ifp->if_bytes == 0) {
- *offset_fsb = end_fsb;
- return 0;
- }
-
resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
XFS_TRANS_RESERVE, &tp);
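The deleted nimaps == 0 checks become dead code once xfs_bmapi_write() reports -ENOSPC itself when it cannot map even part of the request; callers then only propagate the error. A toy sketch of moving that post-condition into the callee:

#include <errno.h>

/* The callee reports "mapped nothing" itself ... */
static int bmapi_write(int *nimaps)
{
	*nimaps = 0;			/* pretend nothing was mapped */
	return *nimaps == 0 ? -ENOSPC : 0;
}

/* ... so the caller no longer needs its own nimaps == 0 check. */
static int fill_cow_hole(void)
{
	int nimaps;

	return bmapi_write(&nimaps);
}

int main(void)
{
	return fill_cow_hole() == -ENOSPC ? 0 : 1;
}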
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 534504ede1a3..2043cea261c0 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -492,6 +492,7 @@ xfs_rui_item_recover(
struct xfs_log_item *lip,
struct list_head *capture_list)
{
+ struct xfs_trans_res resv;
struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
struct xfs_map_extent *rmap;
struct xfs_rud_log_item *rudp;
@@ -519,8 +520,9 @@ xfs_rui_item_recover(
}
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
- mp->m_rmap_maxlevels, 0, XFS_TRANS_RESERVE, &tp);
+ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
+ error = xfs_trans_alloc(mp, &resv, mp->m_rmap_maxlevels, 0,
+ XFS_TRANS_RESERVE, &tp);
if (error)
return error;
rudp = xfs_trans_get_rud(tp, ruip);
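Both recovery paths now build their reservation through xlog_recover_resv() instead of using tr_itruncate directly, presumably so the request is normalized to what a recovering log can actually grant. The helper's real behaviour is not shown in this diff; the sketch below is only a guess at the shape of such a clamp, and every name in it is hypothetical:

/* Hypothetical: clamp a requested reservation to the log's capacity. */
struct trans_res { unsigned logres; int logcount; };

static struct trans_res recover_resv(const struct trans_res *r,
				     unsigned log_capacity)
{
	struct trans_res resv = *r;

	if (resv.logres > log_capacity)
		resv.logres = log_capacity;
	return resv;
}

int main(void)
{
	struct trans_res itruncate = { .logres = 1u << 20, .logcount = 2 };
	struct trans_res resv = recover_resv(&itruncate, 1u << 16);

	return resv.logres == (1u << 16) ? 0 : 1;
}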
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 0bfbbc1dd0da..fc21b4e81ade 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -19,6 +19,7 @@
#include "xfs_icache.h"
#include "xfs_rtalloc.h"
#include "xfs_sb.h"
+#include "xfs_rtbitmap.h"
/*
* Read and return the summary information for a given extent size,
@@ -317,7 +318,7 @@ xfs_rtallocate_extent_block(
/*
* Searched the whole thing & didn't find a maxlen free extent.
*/
- if (minlen < maxlen && besti != -1) {
+ if (minlen <= maxlen && besti != -1) {
xfs_extlen_t p; /* amount to trim length by */
/*
@@ -839,8 +840,6 @@ xfs_growfs_rt_alloc(
nmap = 1;
error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
XFS_BMAPI_METADATA, 0, &map, &nmap);
- if (!error && nmap < 1)
- error = -ENOSPC;
if (error)
goto out_trans_cancel;
/*
@@ -917,6 +916,39 @@ xfs_alloc_rsum_cache(
}
/*
+ * If we changed the rt extent size (meaning there was no rt volume previously)
+ * and the root directory had EXTSZINHERIT and RTINHERIT set, it's possible
+ * that the extent size hint on the root directory is no longer congruent with
+ * the new rt extent size. Log the rootdir inode to fix this.
+ */
+static int
+xfs_growfs_rt_fixup_extsize(
+ struct xfs_mount *mp)
+{
+ struct xfs_inode *ip = mp->m_rootip;
+ struct xfs_trans *tp;
+ int error = 0;
+
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ if (!(ip->i_diflags & XFS_DIFLAG_RTINHERIT) ||
+ !(ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT))
+ goto out_iolock;
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_ichange, 0, 0, false,
+ &tp);
+ if (error)
+ goto out_iolock;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+out_iolock:
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
+}
+
+/*
* Visible (exported) functions.
*/
@@ -945,6 +977,7 @@ xfs_growfs_rt(
xfs_sb_t *sbp; /* old superblock */
xfs_fsblock_t sumbno; /* summary block number */
uint8_t *rsum_cache; /* old summary cache */
+ xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize;
sbp = &mp->m_sb;
@@ -954,34 +987,39 @@ xfs_growfs_rt(
/* Needs to have been mounted with an rt device. */
if (!XFS_IS_REALTIME_MOUNT(mp))
return -EINVAL;
+
+ if (!mutex_trylock(&mp->m_growlock))
+ return -EWOULDBLOCK;
/*
* Mount should fail if the rt bitmap/summary files don't load, but
* we'll check anyway.
*/
+ error = -EINVAL;
if (!mp->m_rbmip || !mp->m_rsumip)
- return -EINVAL;
+ goto out_unlock;
/* Shrink not supported. */
if (in->newblocks <= sbp->sb_rblocks)
- return -EINVAL;
+ goto out_unlock;
/* Can only change rt extent size when adding rt volume. */
if (sbp->sb_rblocks > 0 && in->extsize != sbp->sb_rextsize)
- return -EINVAL;
+ goto out_unlock;
/* Range check the extent size. */
if (XFS_FSB_TO_B(mp, in->extsize) > XFS_MAX_RTEXTSIZE ||
XFS_FSB_TO_B(mp, in->extsize) < XFS_MIN_RTEXTSIZE)
- return -EINVAL;
+ goto out_unlock;
/* Unsupported realtime features. */
+ error = -EOPNOTSUPP;
if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp) || xfs_has_quota(mp))
- return -EOPNOTSUPP;
+ goto out_unlock;
nrblocks = in->newblocks;
error = xfs_sb_validate_fsb_count(sbp, nrblocks);
if (error)
- return error;
+ goto out_unlock;
/*
* Read in the last block of the device, make sure it exists.
*/
@@ -989,7 +1027,7 @@ xfs_growfs_rt(
XFS_FSB_TO_BB(mp, nrblocks - 1),
XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
if (error)
- return error;
+ goto out_unlock;
xfs_buf_relse(bp);
/*
@@ -997,8 +1035,12 @@ xfs_growfs_rt(
*/
nrextents = nrblocks;
do_div(nrextents, in->extsize);
+ if (!xfs_validate_rtextents(nrextents)) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize);
- nrextslog = xfs_highbit32(nrextents);
+ nrextslog = xfs_compute_rextslog(nrextents);
nrsumlevels = nrextslog + 1;
nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks;
nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
@@ -1008,8 +1050,11 @@ xfs_growfs_rt(
* the log. This prevents us from getting a log overflow,
* since we'll log basically the whole summary file at once.
*/
- if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1))
- return -EINVAL;
+ if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
/*
* Get the old block counts for bitmap and summary inodes.
* These can't change since other growfs callers are locked out.
@@ -1021,10 +1066,10 @@ xfs_growfs_rt(
*/
error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip);
if (error)
- return error;
+ goto out_unlock;
error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip);
if (error)
- return error;
+ goto out_unlock;
rsum_cache = mp->m_rsum_cache;
if (nrbmblocks != sbp->sb_rbmblocks)
@@ -1060,13 +1105,16 @@ xfs_growfs_rt(
nsbp->sb_rextents = nsbp->sb_rblocks;
do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
ASSERT(nsbp->sb_rextents != 0);
- nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
+ nsbp->sb_rextslog = xfs_compute_rextslog(nsbp->sb_rextents);
nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
nrsumsize =
(uint)sizeof(xfs_suminfo_t) * nrsumlevels *
nsbp->sb_rbmblocks;
nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
+ /* recompute growfsrt reservation from new rsumsize */
+ xfs_trans_resv_calc(nmp, &nmp->m_resv);
+
/*
* Start a transaction, get the log reservation.
*/
@@ -1150,6 +1198,8 @@ error_cancel:
*/
mp->m_rsumlevels = nrsumlevels;
mp->m_rsumsize = nrsumsize;
+ /* recompute growfsrt reservation from new rsumsize */
+ xfs_trans_resv_calc(mp, &mp->m_resv);
error = xfs_trans_commit(tp);
if (error)
@@ -1161,6 +1211,12 @@ error_cancel:
if (error)
goto out_free;
+ if (old_rextsize != in->extsize) {
+ error = xfs_growfs_rt_fixup_extsize(mp);
+ if (error)
+ goto out_free;
+ }
+
/* Update secondary superblocks now the physical grow has completed */
error = xfs_update_secondary_sbs(mp);
@@ -1184,6 +1240,8 @@ out_free:
}
}
+out_unlock:
+ mutex_unlock(&mp->m_growlock);
return error;
}
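The grow path now derives sb_rextslog from xfs_compute_rextslog() instead of xfs_highbit32(); the interesting difference is the degenerate case, since the log2 of zero rt extents must be 0 rather than the -1 a highbit helper returns. A sketch of that guard, assuming a GCC-style count-leading-zeros builtin:

#include <stdio.h>
#include <stdint.h>

/* Index of the highest set bit, -1 if none (like xfs_highbit64). */
static int highbit64(uint64_t v)
{
	return v ? 63 - __builtin_clzll(v) : -1;
}

/* log2 of the rt extent count; 0 for 0 extents, never negative. */
static uint8_t compute_rextslog(uint64_t rtextents)
{
	if (!rtextents)
		return 0;
	return highbit64(rtextents);
}

int main(void)
{
	printf("%u %u %u\n", compute_rextslog(0), compute_rextslog(1),
	       compute_rextslog(1000));		/* 0 0 9 */
	return 0;
}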
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 65c284e9d33e..11859c259a1c 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -11,22 +11,6 @@
struct xfs_mount;
struct xfs_trans;
-/*
- * XXX: Most of the realtime allocation functions deal in units of realtime
- * extents, not realtime blocks. This looks funny when paired with the type
- * name and screams for a larger cleanup.
- */
-struct xfs_rtalloc_rec {
- xfs_rtblock_t ar_startext;
- xfs_rtblock_t ar_extcount;
-};
-
-typedef int (*xfs_rtalloc_query_range_fn)(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- const struct xfs_rtalloc_rec *rec,
- void *priv);
-
#ifdef CONFIG_XFS_RT
/*
* Function prototypes for exported functions.
@@ -48,19 +32,6 @@ xfs_rtallocate_extent(
xfs_extlen_t prod, /* extent product factor */
xfs_rtblock_t *rtblock); /* out: start block allocated */
-/*
- * Free an extent in the realtime subvolume. Length is expressed in
- * realtime extents, as is the block number.
- */
-int /* error */
-xfs_rtfree_extent(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to free */
- xfs_extlen_t len); /* length of extent freed */
-
-/* Same as above, but in units of rt blocks. */
-int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno,
- xfs_filblks_t rtlen);
/*
* Initialize realtime fields in the mount structure.
@@ -102,55 +73,11 @@ xfs_growfs_rt(
struct xfs_mount *mp, /* file system mount structure */
xfs_growfs_rt_t *in); /* user supplied growfs struct */
-/*
- * From xfs_rtbitmap.c
- */
-int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
-int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len, int val,
- xfs_rtblock_t *new, int *stat);
-int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_rtblock_t limit,
- xfs_rtblock_t *rtblock);
-int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_rtblock_t limit,
- xfs_rtblock_t *rtblock);
-int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len, int val);
-int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp,
- int log, xfs_rtblock_t bbno, int delta,
- struct xfs_buf **rbpp, xfs_fsblock_t *rsb,
- xfs_suminfo_t *sum);
-int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
- xfs_rtblock_t bbno, int delta, struct xfs_buf **rbpp,
- xfs_fsblock_t *rsb);
-int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len,
- struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
-int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
- const struct xfs_rtalloc_rec *low_rec,
- const struct xfs_rtalloc_rec *high_rec,
- xfs_rtalloc_query_range_fn fn, void *priv);
-int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtalloc_query_range_fn fn,
- void *priv);
-bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
-int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len,
- bool *is_free);
int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
#else
# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (-ENOSYS)
-# define xfs_rtfree_extent(t,b,l) (-ENOSYS)
-# define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS)
# define xfs_rtpick_extent(m,t,l,rb) (-ENOSYS)
# define xfs_growfs_rt(mp,in) (-ENOSYS)
-# define xfs_rtalloc_query_range(m,t,l,h,f,p) (-ENOSYS)
-# define xfs_rtalloc_query_all(m,t,f,p) (-ENOSYS)
-# define xfs_rtbuf_get(m,t,b,i,p) (-ENOSYS)
-# define xfs_verify_rtbno(m, r) (false)
-# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS)
# define xfs_rtalloc_reinit_frextents(m) (0)
static inline int /* error */
xfs_rtmount_init(
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 8389f3ef88ef..78bd02a98aa5 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -191,10 +191,12 @@ xfs_symlink(
prid = xfs_get_initial_prid(dp);
/*
- * Make sure that we have allocated dquot(s) on disk.
+ * Make sure that we have allocated dquot(s) on disk. The uid/gid
+ * computation code must match what the VFS uses to assign i_[ug]id.
+ * INHERIT adjusts the gid computation for setgid/grpid systems.
*/
- error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns),
- mapped_fsgid(mnt_userns, &init_user_ns), prid,
+ error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, i_user_ns(VFS_I(dp))),
+ mapped_fsgid(mnt_userns, i_user_ns(VFS_I(dp))), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
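All three creation paths (create, tmpfile, symlink) now map fsuid/fsgid against the inode's own user namespace via i_user_ns() rather than init_user_ns, so the dquots are allocated for exactly the ids the VFS will stamp into i_[ug]id. A deliberately simplified model of why both sides must use the same mapping (the real helpers are mapped_fsuid() and i_user_ns(); here a namespace is just a uid offset):

#include <stdio.h>

struct userns { unsigned base; };	/* toy: fixed uid offset */

/* Map a caller uid into `target`'s id space. */
static unsigned map_fsuid(const struct userns *target, unsigned fsuid)
{
	return target->base + fsuid;
}

int main(void)
{
	struct userns init_ns = { 0 }, fs_ns = { 100000 };
	unsigned fsuid = 1001;

	/* The VFS stamps i_uid using the filesystem's namespace ... */
	unsigned i_uid = map_fsuid(&fs_ns, fsuid);
	/* ... so dquots must be looked up with the same mapping. */
	unsigned dq_old = map_fsuid(&init_ns, fsuid);	/* mismatch */
	unsigned dq_new = map_fsuid(&fs_ns, fsuid);	/* matches */

	printf("i_uid=%u old=%u new=%u\n", i_uid, dq_old, dq_new);
	return dq_new == i_uid ? 0 : 1;
}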
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 0cd62031e53f..a9e3081b6625 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3208,17 +3208,14 @@ DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred);
TRACE_EVENT(xfs_refcount_finish_one_leftover,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- int type, xfs_agblock_t agbno, xfs_extlen_t len,
- xfs_agblock_t new_agbno, xfs_extlen_t new_len),
- TP_ARGS(mp, agno, type, agbno, len, new_agbno, new_len),
+ int type, xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(mp, agno, type, agbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(int, type)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
- __field(xfs_agblock_t, new_agbno)
- __field(xfs_extlen_t, new_len)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
@@ -3226,17 +3223,13 @@ TRACE_EVENT(xfs_refcount_finish_one_leftover,
__entry->type = type;
__entry->agbno = agbno;
__entry->len = len;
- __entry->new_agbno = new_agbno;
- __entry->new_len = new_len;
),
- TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x new_agbno 0x%x new_fsbcount 0x%x",
+ TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->type,
__entry->agno,
__entry->agbno,
- __entry->len,
- __entry->new_agbno,
- __entry->new_len)
+ __entry->len)
);
/* simple inode-based error/%ip tracepoint class */
@@ -3498,6 +3491,31 @@ DEFINE_FSMAP_EVENT(xfs_fsmap_low_key);
DEFINE_FSMAP_EVENT(xfs_fsmap_high_key);
DEFINE_FSMAP_EVENT(xfs_fsmap_mapping);
+DECLARE_EVENT_CLASS(xfs_fsmap_linear_class,
+ TP_PROTO(struct xfs_mount *mp, u32 keydev, uint64_t bno),
+ TP_ARGS(mp, keydev, bno),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, keydev)
+ __field(xfs_fsblock_t, bno)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->keydev = new_decode_dev(keydev);
+ __entry->bno = bno;
+ ),
+ TP_printk("dev %d:%d keydev %d:%d bno 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->keydev), MINOR(__entry->keydev),
+ __entry->bno)
+)
+#define DEFINE_FSMAP_LINEAR_EVENT(name) \
+DEFINE_EVENT(xfs_fsmap_linear_class, name, \
+ TP_PROTO(struct xfs_mount *mp, u32 keydev, uint64_t bno), \
+ TP_ARGS(mp, keydev, bno))
+DEFINE_FSMAP_LINEAR_EVENT(xfs_fsmap_low_key_linear);
+DEFINE_FSMAP_LINEAR_EVENT(xfs_fsmap_high_key_linear);
+
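The new xfs_fsmap_linear_class above follows the standard tracepoint pattern: DECLARE_EVENT_CLASS defines the record layout and formatting once, and each DEFINE_EVENT stamps out a trace_<name>() entry point with the class prototype. A user-space analogue of that macro structure, with made-up device and block values:

#include <stdio.h>
#include <stdint.h>

/* Analogue of DECLARE_EVENT_CLASS: one shared formatter ... */
static void fsmap_linear_class(const char *event, unsigned keydev,
			       uint64_t bno)
{
	printf("%s: keydev %u:%u bno 0x%llx\n", event,
	       (keydev >> 8) & 0xff, keydev & 0xff,
	       (unsigned long long)bno);
}

/* ... and of DEFINE_EVENT: thin wrappers sharing it. */
#define DEFINE_LINEAR_EVENT(name)				\
static void trace_##name(unsigned keydev, uint64_t bno)		\
{								\
	fsmap_linear_class(#name, keydev, bno);			\
}

DEFINE_LINEAR_EVENT(xfs_fsmap_low_key_linear)
DEFINE_LINEAR_EVENT(xfs_fsmap_high_key_linear)

int main(void)
{
	trace_xfs_fsmap_low_key_linear(0x0803, 0x1000);
	trace_xfs_fsmap_high_key_linear(0x0803, 0x2000);
	return 0;
}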
DECLARE_EVENT_CLASS(xfs_getfsmap_class,
TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap),
TP_ARGS(mp, fsmap),