Diffstat (limited to 'fs')
241 files changed, 7946 insertions, 6202 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 3576123d8299..6ecf863bfa2f 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -625,7 +625,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma) inode = file_inode(vma->vm_file); - if (!mapping_cap_writeback_dirty(inode->i_mapping)) + if (!mapping_can_writeback(inode->i_mapping)) wbc.nr_to_write = 0; might_sleep(); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 74df32be4c6a..e34fa20acf61 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -80,8 +80,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, if (ret) return ret; - if (v9ses->cache) - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; + if (!v9ses->cache) { + sb->s_bdi->ra_pages = 0; + sb->s_bdi->io_pages = 0; + } sb->s_flags |= SB_ACTIVE | SB_DIRSYNC; if (!v9ses->cache) diff --git a/fs/Makefile b/fs/Makefile index 1c7b0e3f6daa..7bb2a05fda1f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -13,7 +13,8 @@ obj-y := open.o read_write.o file_table.o super.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ - fs_types.o fs_context.o fs_parser.o fsopen.o init.o + fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ + kernel_read_file.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o @@ -37,7 +38,6 @@ obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FS_VERITY) += verity/ obj-$(CONFIG_FILE_LOCKING) += locks.o -obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 1d13d2e882ad..0fe8844b4bee 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -810,14 +810,32 @@ void afs_evict_inode(struct inode *inode) static void afs_setattr_success(struct afs_operation *op) { - struct inode *inode = &op->file[0].vnode->vfs_inode; + struct afs_vnode_param *vp = &op->file[0]; + struct inode *inode = &vp->vnode->vfs_inode; + loff_t old_i_size = i_size_read(inode); + + op->setattr.old_i_size = old_i_size; + afs_vnode_commit_status(op, vp); + /* inode->i_size has now been changed. */ + + if (op->setattr.attr->ia_valid & ATTR_SIZE) { + loff_t size = op->setattr.attr->ia_size; + if (size > old_i_size) + pagecache_isize_extended(inode, old_i_size, size); + } +} + +static void afs_setattr_edit_file(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[0]; + struct inode *inode = &vp->vnode->vfs_inode; - afs_vnode_commit_status(op, &op->file[0]); if (op->setattr.attr->ia_valid & ATTR_SIZE) { - loff_t i_size = inode->i_size, size = op->setattr.attr->ia_size; - if (size > i_size) - pagecache_isize_extended(inode, i_size, size); - truncate_pagecache(inode, size); + loff_t size = op->setattr.attr->ia_size; + loff_t i_size = op->setattr.old_i_size; + + if (size < i_size) + truncate_pagecache(inode, size); } } @@ -825,6 +843,7 @@ static const struct afs_operation_ops afs_setattr_operation = { .issue_afs_rpc = afs_fs_setattr, .issue_yfs_rpc = yfs_fs_setattr, .success = afs_setattr_success, + .edit_dir = afs_setattr_edit_file, }; /* @@ -863,11 +882,16 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) if (S_ISREG(vnode->vfs_inode.i_mode)) filemap_write_and_wait(vnode->vfs_inode.i_mapping); + /* Prevent any new writebacks from starting whilst we do this. 
*/ + down_write(&vnode->validate_lock); + op = afs_alloc_operation(((attr->ia_valid & ATTR_FILE) ? afs_file_key(attr->ia_file) : NULL), vnode->volume); - if (IS_ERR(op)) - return PTR_ERR(op); + if (IS_ERR(op)) { + ret = PTR_ERR(op); + goto out_unlock; + } afs_op_set_vnode(op, 0, vnode); op->setattr.attr = attr; @@ -880,5 +904,10 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) op->file[0].update_ctime = 1; op->ops = &afs_setattr_operation; - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + +out_unlock: + up_write(&vnode->validate_lock); + _leave(" = %d", ret); + return ret; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 18042b7dab6a..e5f0446f27e5 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -812,6 +812,7 @@ struct afs_operation { } store; struct { struct iattr *attr; + loff_t old_i_size; } setattr; struct afs_acl *acl; struct yfs_acl *yacl; diff --git a/fs/afs/super.c b/fs/afs/super.c index b552357b1d13..3a40ee752c1e 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -456,7 +456,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) ret = super_setup_bdi(sb); if (ret) return ret; - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; /* allocate the root inode and dentry */ if (as->dyn_root) { diff --git a/fs/afs/write.c b/fs/afs/write.c index 4b2265cb1891..da12abd6db21 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -738,11 +738,21 @@ static int afs_writepages_region(struct address_space *mapping, int afs_writepages(struct address_space *mapping, struct writeback_control *wbc) { + struct afs_vnode *vnode = AFS_FS_I(mapping->host); pgoff_t start, end, next; int ret; _enter(""); + /* We have to be careful as we can end up racing with setattr() + * truncating the pagecache since the caller doesn't take a lock here + * to prevent it. + */ + if (wbc->sync_mode == WB_SYNC_ALL) + down_read(&vnode->validate_lock); + else if (!down_read_trylock(&vnode->validate_lock)) + return 0; + if (wbc->range_cyclic) { start = mapping->writeback_index; end = -1; @@ -762,6 +772,7 @@ int afs_writepages(struct address_space *mapping, ret = afs_writepages_region(mapping, wbc, start, end, &next); } + up_read(&vnode->validate_lock); _leave(" = %d", ret); return ret; } @@ -1489,12 +1489,8 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb, *iovec = NULL; return ret; } -#ifdef CONFIG_COMPAT - if (compat) - return compat_import_iovec(rw, buf, len, UIO_FASTIOV, iovec, - iter); -#endif - return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter); + + return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat); } static inline void aio_rw_done(struct kiocb *req, ssize_t ret) diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index 75105f45c51a..322b7dfb4ea0 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -8,6 +8,7 @@ #include <linux/compat.h> #include <linux/syscalls.h> #include <linux/magic.h> +#include <linux/nospec.h> #include "autofs_i.h" @@ -563,7 +564,7 @@ out: static ioctl_fn lookup_dev_ioctl(unsigned int cmd) { - static ioctl_fn _ioctls[] = { + static const ioctl_fn _ioctls[] = { autofs_dev_ioctl_version, autofs_dev_ioctl_protover, autofs_dev_ioctl_protosubver, @@ -581,7 +582,10 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd) }; unsigned int idx = cmd_idx(cmd); - return (idx >= ARRAY_SIZE(_ioctls)) ? 
NULL : _ioctls[idx]; + if (idx >= ARRAY_SIZE(_ioctls)) + return NULL; + idx = array_index_nospec(idx, ARRAY_SIZE(_ioctls)); + return _ioctls[idx]; } /* ioctl dispatcher */ diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c index 74c886f7c51c..5ced859dac53 100644 --- a/fs/autofs/waitq.c +++ b/fs/autofs/waitq.c @@ -53,7 +53,7 @@ static int autofs_write(struct autofs_sb_info *sbi, mutex_lock(&sbi->pipe_mutex); while (bytes) { - wr = kernel_write(file, data, bytes, &file->f_pos); + wr = __kernel_write(file, data, bytes, NULL); if (wr <= 0) break; data += wr; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 13d053982dd7..e7e9d0cde51a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/fs.h> +#include <linux/log2.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/errno.h> @@ -421,6 +422,26 @@ static int elf_read(struct file *file, void *buf, size_t len, loff_t pos) return 0; } +static unsigned long maximum_alignment(struct elf_phdr *cmds, int nr) +{ + unsigned long alignment = 0; + int i; + + for (i = 0; i < nr; i++) { + if (cmds[i].p_type == PT_LOAD) { + unsigned long p_align = cmds[i].p_align; + + /* skip non-power of two alignments as invalid */ + if (!is_power_of_2(p_align)) + continue; + alignment = max(alignment, p_align); + } + } + + /* ensure we align to at least one page */ + return ELF_PAGEALIGN(alignment); +} + /** * load_elf_phdrs() - load ELF program headers * @elf_ex: ELF header of the binary whose program headers should be loaded @@ -1008,6 +1029,7 @@ out_free_interp: int elf_prot, elf_flags; unsigned long k, vaddr; unsigned long total_size = 0; + unsigned long alignment; if (elf_ppnt->p_type != PT_LOAD) continue; @@ -1086,6 +1108,9 @@ out_free_interp: load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); + alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); + if (alignment) + load_bias &= ~(alignment - 1); elf_flags |= MAP_FIXED; } else load_bias = 0; @@ -1389,126 +1414,6 @@ out: * Jeremy Fitzhardinge <jeremy@sw.oz.au> */ -/* - * The purpose of always_dump_vma() is to make sure that special kernel mappings - * that are useful for post-mortem analysis are included in every core dump. - * In that way we ensure that the core dump is fully interpretable later - * without matching up the same kernel and hardware config to see what PC values - * meant. These special mappings include - vDSO, vsyscall, and other - * architecture specific mappings - */ -static bool always_dump_vma(struct vm_area_struct *vma) -{ - /* Any vsyscall mappings? */ - if (vma == get_gate_vma(vma->vm_mm)) - return true; - - /* - * Assume that all vmas with a .name op should always be dumped. - * If this changes, a new vm_ops field can easily be added. - */ - if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma)) - return true; - - /* - * arch_vma_name() returns non-NULL for special architecture mappings, - * such as vDSO sections. - */ - if (arch_vma_name(vma)) - return true; - - return false; -} - -/* - * Decide what to dump of a segment, part, all or none. 
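The autofs lookup_dev_ioctl() hunk above is the standard Spectre-v1 hardening pattern from <linux/nospec.h>: bounds-check the index first, then clamp it with array_index_nospec() so a mispredicted branch cannot speculatively read past the table. A minimal sketch of the same pattern, using a hypothetical table rather than code from this series:

	#include <linux/kernel.h>
	#include <linux/nospec.h>

	static const char * const names[] = { "zero", "one", "two" };

	static const char *lookup_name(unsigned int idx)
	{
		if (idx >= ARRAY_SIZE(names))
			return NULL;
		/* Clamp idx to 0..ARRAY_SIZE(names)-1 even under speculation. */
		idx = array_index_nospec(idx, ARRAY_SIZE(names));
		return names[idx];
	}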
- */ -static unsigned long vma_dump_size(struct vm_area_struct *vma, - unsigned long mm_flags) -{ -#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) - - /* always dump the vdso and vsyscall sections */ - if (always_dump_vma(vma)) - goto whole; - - if (vma->vm_flags & VM_DONTDUMP) - return 0; - - /* support for DAX */ - if (vma_is_dax(vma)) { - if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED)) - goto whole; - if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE)) - goto whole; - return 0; - } - - /* Hugetlb memory check */ - if (is_vm_hugetlb_page(vma)) { - if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) - goto whole; - if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) - goto whole; - return 0; - } - - /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & VM_IO) - return 0; - - /* By default, dump shared memory if mapped from an anonymous file. */ - if (vma->vm_flags & VM_SHARED) { - if (file_inode(vma->vm_file)->i_nlink == 0 ? - FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED)) - goto whole; - return 0; - } - - /* Dump segments that have been written to. */ - if (vma->anon_vma && FILTER(ANON_PRIVATE)) - goto whole; - if (vma->vm_file == NULL) - return 0; - - if (FILTER(MAPPED_PRIVATE)) - goto whole; - - /* - * If this looks like the beginning of a DSO or executable mapping, - * check for an ELF header. If we find one, dump the first page to - * aid in determining what was mapped here. - */ - if (FILTER(ELF_HEADERS) && - vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { - u32 __user *header = (u32 __user *) vma->vm_start; - u32 word; - /* - * Doing it this way gets the constant folded by GCC. - */ - union { - u32 cmp; - char elfmag[SELFMAG]; - } magic; - BUILD_BUG_ON(SELFMAG != sizeof word); - magic.elfmag[EI_MAG0] = ELFMAG0; - magic.elfmag[EI_MAG1] = ELFMAG1; - magic.elfmag[EI_MAG2] = ELFMAG2; - magic.elfmag[EI_MAG3] = ELFMAG3; - if (unlikely(get_user(word, header))) - word = 0; - if (word == magic.cmp) - return PAGE_SIZE; - } - -#undef FILTER - - return 0; - -whole: - return vma->vm_end - vma->vm_start; -} - /* An ELF note in memory */ struct memelfnote { @@ -2220,32 +2125,6 @@ static void free_note_info(struct elf_note_info *info) #endif -static struct vm_area_struct *first_vma(struct task_struct *tsk, - struct vm_area_struct *gate_vma) -{ - struct vm_area_struct *ret = tsk->mm->mmap; - - if (ret) - return ret; - return gate_vma; -} -/* - * Helper function for iterating across a vma list. It ensures that the caller - * will visit `gate_vma' prior to terminating the search. 
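A worked example for the maximum_alignment() helper and the load_bias masking added to binfmt_elf.c above, with illustrative values only: maximum_alignment() scans the PT_LOAD headers, skips any p_align that is not a power of two, and page-aligns the largest remaining value; the PIE branch then rounds the randomized base down to that boundary so the segment's alignment request still holds.

	unsigned long alignment = 0x200000;        /* e.g. a 2 MiB p_align */
	unsigned long load_bias = 0x7f3a81d4b000;  /* ELF_ET_DYN_BASE + arch_mmap_rnd() */

	load_bias &= ~(alignment - 1);             /* -> 0x7f3a81c00000, 2 MiB aligned */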
- */ -static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, - struct vm_area_struct *gate_vma) -{ - struct vm_area_struct *ret; - - ret = this_vma->vm_next; - if (ret) - return ret; - if (this_vma == gate_vma) - return NULL; - return gate_vma; -} - static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, elf_addr_t e_shoff, int segs) { @@ -2272,9 +2151,8 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int segs, i; - size_t vma_data_size = 0; - struct vm_area_struct *vma, *gate_vma; + int vma_count, segs, i; + size_t vma_data_size; struct elfhdr elf; loff_t offset = 0, dataoff; struct elf_note_info info = { }; @@ -2282,30 +2160,16 @@ static int elf_core_dump(struct coredump_params *cprm) struct elf_shdr *shdr4extnum = NULL; Elf_Half e_phnum; elf_addr_t e_shoff; - elf_addr_t *vma_filesz = NULL; + struct core_vma_metadata *vma_meta; + + if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) + return 0; - /* - * We no longer stop all VM operations. - * - * This is because those proceses that could possibly change map_count - * or the mmap / vma pages are now blocked in do_exit on current - * finishing this core dump. - * - * Only ptrace can touch these memory addresses, but it doesn't change - * the map_count or the pages allocated. So no possibility of crashing - * exists while dumping the mm->vm_next areas to the core file. - */ - /* * The number of segs are recored into ELF header as 16bit value. * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. */ - segs = current->mm->map_count; - segs += elf_core_extra_phdrs(); - - gate_vma = get_gate_vma(current->mm); - if (gate_vma != NULL) - segs++; + segs = vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -2343,24 +2207,6 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - /* - * Zero vma process will get ZERO_SIZE_PTR here. - * Let coredump continue for register state at least. - */ - vma_filesz = kvmalloc(array_size(sizeof(*vma_filesz), (segs - 1)), - GFP_KERNEL); - if (!vma_filesz) - goto end_coredump; - - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma)) { - unsigned long dump_size; - - dump_size = vma_dump_size(vma, cprm->mm_flags); - vma_filesz[i++] = dump_size; - vma_data_size += dump_size; - } - offset += vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -2381,21 +2227,23 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Write program headers for segments dump */ - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma)) { + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; struct elf_phdr phdr; phdr.p_type = PT_LOAD; phdr.p_offset = offset; - phdr.p_vaddr = vma->vm_start; + phdr.p_vaddr = meta->start; phdr.p_paddr = 0; - phdr.p_filesz = vma_filesz[i++]; - phdr.p_memsz = vma->vm_end - vma->vm_start; + phdr.p_filesz = meta->dump_size; + phdr.p_memsz = meta->end - meta->start; offset += phdr.p_filesz; - phdr.p_flags = vma->vm_flags & VM_READ ? 
PF_R : 0; - if (vma->vm_flags & VM_WRITE) + phdr.p_flags = 0; + if (meta->flags & VM_READ) + phdr.p_flags |= PF_R; + if (meta->flags & VM_WRITE) phdr.p_flags |= PF_W; - if (vma->vm_flags & VM_EXEC) + if (meta->flags & VM_EXEC) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; @@ -2417,28 +2265,11 @@ static int elf_core_dump(struct coredump_params *cprm) if (!dump_skip(cprm, dataoff - cprm->pos)) goto end_coredump; - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma)) { - unsigned long addr; - unsigned long end; + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; - end = vma->vm_start + vma_filesz[i++]; - - for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { - struct page *page; - int stop; - - page = get_dump_page(addr); - if (page) { - void *kaddr = kmap(page); - stop = !dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap(page); - put_page(page); - } else - stop = !dump_skip(cprm, PAGE_SIZE); - if (stop) - goto end_coredump; - } + if (!dump_user_range(cprm, meta->start, meta->dump_size)) + goto end_coredump; } dump_truncate(cprm); @@ -2453,7 +2284,7 @@ static int elf_core_dump(struct coredump_params *cprm) end_coredump: free_note_info(&info); kfree(shdr4extnum); - kvfree(vma_filesz); + kvfree(vma_meta); kfree(phdr4note); return has_dumped; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 50f845702b92..be4062b8ba75 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1215,76 +1215,6 @@ struct elf_prstatus_fdpic int pr_fpvalid; /* True if math co-processor being used. */ }; -/* - * Decide whether a segment is worth dumping; default is yes to be - * sure (missing info is worse than too much; etc). - * Personally I'd include everything, and use the coredump limit... - * - * I think we should skip something. But I am not sure how. H.J. - */ -static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) -{ - int dump_ok; - - /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & VM_IO) { - kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); - return 0; - } - - /* If we may not read the contents, don't allow us to dump - * them either. "dump_write()" can't handle it anyway. - */ - if (!(vma->vm_flags & VM_READ)) { - kdcore("%08lx: %08lx: no (!read)", vma->vm_start, vma->vm_flags); - return 0; - } - - /* support for DAX */ - if (vma_is_dax(vma)) { - if (vma->vm_flags & VM_SHARED) { - dump_ok = test_bit(MMF_DUMP_DAX_SHARED, &mm_flags); - kdcore("%08lx: %08lx: %s (DAX shared)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - } else { - dump_ok = test_bit(MMF_DUMP_DAX_PRIVATE, &mm_flags); - kdcore("%08lx: %08lx: %s (DAX private)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - } - return dump_ok; - } - - /* By default, dump shared memory if mapped from an anonymous file. */ - if (vma->vm_flags & VM_SHARED) { - if (file_inode(vma->vm_file)->i_nlink == 0) { - dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags); - kdcore("%08lx: %08lx: %s (share)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - return dump_ok; - } - - dump_ok = test_bit(MMF_DUMP_MAPPED_SHARED, &mm_flags); - kdcore("%08lx: %08lx: %s (share)", vma->vm_start, - vma->vm_flags, dump_ok ? 
"yes" : "no"); - return dump_ok; - } - -#ifdef CONFIG_MMU - /* By default, if it hasn't been written to, don't write it out */ - if (!vma->anon_vma) { - dump_ok = test_bit(MMF_DUMP_MAPPED_PRIVATE, &mm_flags); - kdcore("%08lx: %08lx: %s (!anon)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - return dump_ok; - } -#endif - - dump_ok = test_bit(MMF_DUMP_ANON_PRIVATE, &mm_flags); - kdcore("%08lx: %08lx: %s", vma->vm_start, vma->vm_flags, - dump_ok ? "yes" : "no"); - return dump_ok; -} - /* An ELF note in memory */ struct memelfnote { @@ -1524,54 +1454,21 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, /* * dump the segments for an MMU process */ -static bool elf_fdpic_dump_segments(struct coredump_params *cprm) +static bool elf_fdpic_dump_segments(struct coredump_params *cprm, + struct core_vma_metadata *vma_meta, + int vma_count) { - struct vm_area_struct *vma; + int i; - for (vma = current->mm->mmap; vma; vma = vma->vm_next) { -#ifdef CONFIG_MMU - unsigned long addr; -#endif + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; - if (!maydump(vma, cprm->mm_flags)) - continue; - -#ifdef CONFIG_MMU - for (addr = vma->vm_start; addr < vma->vm_end; - addr += PAGE_SIZE) { - bool res; - struct page *page = get_dump_page(addr); - if (page) { - void *kaddr = kmap(page); - res = dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap(page); - put_page(page); - } else { - res = dump_skip(cprm, PAGE_SIZE); - } - if (!res) - return false; - } -#else - if (!dump_emit(cprm, (void *) vma->vm_start, - vma->vm_end - vma->vm_start)) + if (!dump_user_range(cprm, meta->start, meta->dump_size)) return false; -#endif } return true; } -static size_t elf_core_vma_data_size(unsigned long mm_flags) -{ - struct vm_area_struct *vma; - size_t size = 0; - - for (vma = current->mm->mmap; vma; vma = vma->vm_next) - if (maydump(vma, mm_flags)) - size += vma->vm_end - vma->vm_start; - return size; -} - /* * Actual dumper * @@ -1582,9 +1479,8 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags) static int elf_fdpic_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int segs; + int vma_count, segs; int i; - struct vm_area_struct *vma; struct elfhdr *elf = NULL; loff_t offset = 0, dataoff; struct memelfnote psinfo_note, auxv_note; @@ -1598,18 +1494,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) elf_addr_t e_shoff; struct core_thread *ct; struct elf_thread_status *tmp; - - /* - * We no longer stop all VM operations. - * - * This is because those proceses that could possibly change map_count - * or the mmap / vma pages are now blocked in do_exit on current - * finishing this core dump. - * - * Only ptrace can touch these memory addresses, but it doesn't change - * the map_count or the pages allocated. So no possibility of crashing - * exists while dumping the mm->vm_next areas to the core file. 
- */ + struct core_vma_metadata *vma_meta = NULL; + size_t vma_data_size; /* alloc memory for large data structures: too large to be on stack */ elf = kmalloc(sizeof(*elf), GFP_KERNEL); @@ -1619,6 +1505,9 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!psinfo) goto end_coredump; + if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) + goto end_coredump; + for (ct = current->mm->core_state->dumper.next; ct; ct = ct->next) { tmp = elf_dump_thread_status(cprm->siginfo->si_signo, @@ -1638,8 +1527,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) tmp->next = thread_list; thread_list = tmp; - segs = current->mm->map_count; - segs += elf_core_extra_phdrs(); + segs = vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -1684,7 +1572,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) /* Page-align dumped data */ dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += elf_core_vma_data_size(cprm->mm_flags); + offset += vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -1704,23 +1592,26 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; /* write program headers for segments dump */ - for (vma = current->mm->mmap; vma; vma = vma->vm_next) { + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; struct elf_phdr phdr; size_t sz; - sz = vma->vm_end - vma->vm_start; + sz = meta->end - meta->start; phdr.p_type = PT_LOAD; phdr.p_offset = offset; - phdr.p_vaddr = vma->vm_start; + phdr.p_vaddr = meta->start; phdr.p_paddr = 0; - phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0; + phdr.p_filesz = meta->dump_size; phdr.p_memsz = sz; offset += phdr.p_filesz; - phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; - if (vma->vm_flags & VM_WRITE) + phdr.p_flags = 0; + if (meta->flags & VM_READ) + phdr.p_flags |= PF_R; + if (meta->flags & VM_WRITE) phdr.p_flags |= PF_W; - if (vma->vm_flags & VM_EXEC) + if (meta->flags & VM_EXEC) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; @@ -1752,7 +1643,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!dump_skip(cprm, dataoff - cprm->pos)) goto end_coredump; - if (!elf_fdpic_dump_segments(cprm)) + if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count)) goto end_coredump; if (!elf_core_write_extra_data(cprm)) @@ -1776,6 +1667,7 @@ end_coredump: thread_list = thread_list->next; kfree(tmp); } + kvfree(vma_meta); kfree(phdr4note); kfree(elf); kfree(psinfo); diff --git a/fs/block_dev.c b/fs/block_dev.c index 8ae833e00443..9e84b1928b94 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -103,6 +103,35 @@ void invalidate_bdev(struct block_device *bdev) } EXPORT_SYMBOL(invalidate_bdev); +/* + * Drop all buffers & page cache for given bdev range. This function bails + * with error if bdev has other exclusive owner (such as filesystem). + */ +int truncate_bdev_range(struct block_device *bdev, fmode_t mode, + loff_t lstart, loff_t lend) +{ + struct block_device *claimed_bdev = NULL; + int err; + + /* + * If we don't hold exclusive handle for the device, upgrade to it + * while we discard the buffer cache to avoid discarding buffers + * under live filesystem. 
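Stepping back to the two core-dump conversions above: binfmt_elf.c and binfmt_elf_fdpic.c now take a single dump_vma_snapshot() up front and iterate an array of struct core_vma_metadata, instead of walking current->mm->mmap (plus the gate vma) once to size the dump and again to write it. A rough sketch of the consumer side, using only names visible in this diff:

	int i, vma_count;
	size_t vma_data_size;
	struct core_vma_metadata *vma_meta;

	if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
		return 0;
	for (i = 0; i < vma_count; i++) {
		struct core_vma_metadata *m = vma_meta + i;

		/* m->start/m->end bound the mapping; m->dump_size is how
		 * much of it the snapshot decided should be written out. */
		if (!dump_user_range(cprm, m->start, m->dump_size))
			break;
	}
	kvfree(vma_meta);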
+ */ + if (!(mode & FMODE_EXCL)) { + claimed_bdev = bdev->bd_contains; + err = bd_prepare_to_claim(bdev, claimed_bdev, + truncate_bdev_range); + if (err) + return err; + } + truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); + if (claimed_bdev) + bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range); + return 0; +} +EXPORT_SYMBOL(truncate_bdev_range); + static void set_init_blocksize(struct block_device *bdev) { bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev)); @@ -862,7 +891,7 @@ static int bdev_set(struct inode *inode, void *data) return 0; } -struct block_device *bdget(dev_t dev) +static struct block_device *bdget(dev_t dev) { struct block_device *bdev; struct inode *inode; @@ -876,11 +905,11 @@ struct block_device *bdget(dev_t dev) bdev = &BDEV_I(inode)->bdev; if (inode->i_state & I_NEW) { + spin_lock_init(&bdev->bd_size_lock); bdev->bd_contains = NULL; bdev->bd_super = NULL; bdev->bd_inode = inode; bdev->bd_part_count = 0; - bdev->bd_invalidated = 0; inode->i_mode = S_IFBLK; inode->i_rdev = dev; inode->i_bdev = bdev; @@ -891,8 +920,6 @@ struct block_device *bdget(dev_t dev) return bdev; } -EXPORT_SYMBOL(bdget); - /** * bdgrab -- Grab a reference to an already referenced block device * @bdev: Block device to grab a reference to. @@ -904,6 +931,11 @@ struct block_device *bdgrab(struct block_device *bdev) } EXPORT_SYMBOL(bdgrab); +struct block_device *bdget_part(struct hd_struct *part) +{ + return bdget(part_devt(part)); +} + long nr_blockdev_pages(void) { struct inode *inode; @@ -1290,6 +1322,7 @@ static void check_disk_size_change(struct gendisk *disk, { loff_t disk_size, bdev_size; + spin_lock(&bdev->bd_size_lock); disk_size = (loff_t)get_capacity(disk) << 9; bdev_size = i_size_read(bdev->bd_inode); if (disk_size != bdev_size) { @@ -1299,85 +1332,51 @@ static void check_disk_size_change(struct gendisk *disk, disk->disk_name, bdev_size, disk_size); } i_size_write(bdev->bd_inode, disk_size); - if (bdev_size > disk_size && __invalidate_device(bdev, false)) + } + spin_unlock(&bdev->bd_size_lock); + + if (bdev_size > disk_size) { + if (__invalidate_device(bdev, false)) pr_warn("VFS: busy inodes on resized disk %s\n", disk->disk_name); } - bdev->bd_invalidated = 0; } /** - * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back - * @disk: struct gendisk to be revalidated + * revalidate_disk_size - checks for disk size change and adjusts bdev size. + * @disk: struct gendisk to check + * @verbose: if %true log a message about a size change if there is any * - * This routine is a wrapper for lower-level driver's revalidate_disk - * call-backs. It is used to do common pre and post operations needed - * for all revalidate_disk operations. + * This routine checks to see if the bdev size does not match the disk size + * and adjusts it if it differs. When shrinking the bdev size, all its caches + * are freed. */ -int revalidate_disk(struct gendisk *disk) +void revalidate_disk_size(struct gendisk *disk, bool verbose) { - int ret = 0; - - if (disk->fops->revalidate_disk) - ret = disk->fops->revalidate_disk(disk); + struct block_device *bdev; /* * Hidden disks don't have associated bdev so there's no point in - * revalidating it. + * revalidating them.
*/ - if (!(disk->flags & GENHD_FL_HIDDEN)) { - struct block_device *bdev = bdget_disk(disk, 0); - - if (!bdev) - return ret; + if (disk->flags & GENHD_FL_HIDDEN) + return; - mutex_lock(&bdev->bd_mutex); - check_disk_size_change(disk, bdev, ret == 0); - mutex_unlock(&bdev->bd_mutex); + bdev = bdget_disk(disk, 0); + if (bdev) { + check_disk_size_change(disk, bdev, verbose); bdput(bdev); } - return ret; } -EXPORT_SYMBOL(revalidate_disk); +EXPORT_SYMBOL(revalidate_disk_size); -/* - * This routine checks whether a removable media has been changed, - * and invalidates all buffer-cache-entries in that case. This - * is a relatively slow routine, so we have to try to minimize using - * it. Thus it is called only upon a 'mount' or 'open'. This - * is the best way of combining speed and utility, I think. - * People changing diskettes in the middle of an operation deserve - * to lose :-) - */ -int check_disk_change(struct block_device *bdev) +void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors) { - struct gendisk *disk = bdev->bd_disk; - const struct block_device_operations *bdops = disk->fops; - unsigned int events; - - events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | - DISK_EVENT_EJECT_REQUEST); - if (!(events & DISK_EVENT_MEDIA_CHANGE)) - return 0; - - if (__invalidate_device(bdev, true)) - pr_warn("VFS: busy inodes on changed media %s\n", - disk->disk_name); - bdev->bd_invalidated = 1; - if (bdops->revalidate_disk) - bdops->revalidate_disk(bdev->bd_disk); - return 1; -} - -EXPORT_SYMBOL(check_disk_change); - -void bd_set_size(struct block_device *bdev, loff_t size) -{ - inode_lock(bdev->bd_inode); - i_size_write(bdev->bd_inode, size); - inode_unlock(bdev->bd_inode); + spin_lock(&bdev->bd_size_lock); + i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + spin_unlock(&bdev->bd_size_lock); } -EXPORT_SYMBOL(bd_set_size); +EXPORT_SYMBOL(bd_set_nr_sectors); static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); @@ -1388,6 +1387,8 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) lockdep_assert_held(&bdev->bd_mutex); + clear_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); + rescan: ret = blk_drop_partitions(bdev); if (ret) @@ -1446,22 +1447,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, struct gendisk *disk; int ret; int partno; - int perm = 0; bool first_open = false, unblock_events = true, need_restart; - if (mode & FMODE_READ) - perm |= MAY_READ; - if (mode & FMODE_WRITE) - perm |= MAY_WRITE; - /* - * hooks: /n/, see "layering violations". - */ - if (!for_part) { - ret = devcgroup_inode_permission(bdev->bd_inode, perm); - if (ret != 0) - return ret; - } - restart: need_restart = false; ret = -ENXIO; @@ -1514,7 +1501,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, } if (!ret) { - bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + bd_set_nr_sectors(bdev, get_capacity(disk)); set_init_blocksize(bdev); } @@ -1524,7 +1511,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, * The latter is necessary to prevent ghost * partitions on a removed medium. 
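The block_dev.c hunks above replace the per-bdev bd_invalidated flag with the GD_NEED_PART_SCAN bit in the gendisk state word, so a rescan request becomes a plain atomic bitop handshake. A sketch of the three sides of it (the bit name is from this diff; the framing code is illustrative):

	/* requester: ask for a partition rescan on the next open */
	set_bit(GD_NEED_PART_SCAN, &disk->state);

	/* opener: honour a pending request */
	if (test_bit(GD_NEED_PART_SCAN, &disk->state))
		bdev_disk_changed(bdev, false);

	/* bdev_disk_changed() itself starts by clearing the bit */
	clear_bit(GD_NEED_PART_SCAN, &disk->state);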
*/ - if (bdev->bd_invalidated && + if (test_bit(GD_NEED_PART_SCAN, &disk->state) && (!ret || ret == -ENOMEDIUM)) bdev_disk_changed(bdev, ret == -ENOMEDIUM); @@ -1542,7 +1529,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, ret = -ENXIO; goto out_clear; } - bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); + bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects); set_init_blocksize(bdev); } @@ -1554,7 +1541,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); /* the same as first opener case, read comment there */ - if (bdev->bd_invalidated && + if (test_bit(GD_NEED_PART_SCAN, &disk->state) && (!ret || ret == -ENOMEDIUM)) bdev_disk_changed(bdev, ret == -ENOMEDIUM); if (ret) @@ -1632,16 +1619,27 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, * RETURNS: * 0 on success, -errno on failure. */ -int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) +static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) { - int res; + int ret, perm = 0; - res =__blkdev_get(bdev, mode, holder, 0); - if (res) - bdput(bdev); - return res; + if (mode & FMODE_READ) + perm |= MAY_READ; + if (mode & FMODE_WRITE) + perm |= MAY_WRITE; + ret = devcgroup_inode_permission(bdev->bd_inode, perm); + if (ret) + goto bdput; + + ret =__blkdev_get(bdev, mode, holder, 0); + if (ret) + goto bdput; + return 0; + +bdput: + bdput(bdev); + return ret; } -EXPORT_SYMBOL(blkdev_get); /** * blkdev_get_by_path - open a block device by name @@ -1889,7 +1887,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if (bdev_read_only(I_BDEV(bd_inode))) return -EPERM; - if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode)) + if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) return -ETXTBSY; if (!iov_iter_count(from)) @@ -1969,7 +1967,6 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) { struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - struct address_space *mapping; loff_t end = start + len - 1; loff_t isize; int error; @@ -1997,8 +1994,9 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, return -EINVAL; /* Invalidate the page cache, including dirty pages. */ - mapping = bdev->bd_inode->i_mapping; - truncate_inode_pages_range(mapping, start, end); + error = truncate_bdev_range(bdev, file->f_mode, start, end); + if (error) + return error; switch (mode) { case FALLOC_FL_ZERO_RANGE: @@ -2025,7 +2023,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, * the caller will be given -EBUSY. The third argument is * inclusive, so the rounding here is safe. 
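On the blkdev_fallocate() hunk above: the invalidation range is inclusive, which is why the plain right-shifts are safe. With 4 KiB pages (PAGE_SHIFT == 12), start = 4096 and len = 8192 give end = 4096 + 8192 - 1 = 12287, so:

	start >> PAGE_SHIFT	/* 4096  >> 12 == 1, first page index */
	end >> PAGE_SHIFT	/* 12287 >> 12 == 2, last page index, inclusive */

Pages 1 and 2 are invalidated; the page starting at offset 12288 is untouched.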
*/ - return invalidate_inode_pages2_range(mapping, + return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); } diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 575636f6491e..68b95ad82126 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -14,6 +14,7 @@ config BTRFS_FS select LZO_DECOMPRESS select ZSTD_COMPRESS select ZSTD_DECOMPRESS + select FS_IOMAP select RAID6_PQ select XOR_BLOCKS select SRCU diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ea1c28ccb44f..b3268f4ea5f3 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2997,7 +2997,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, while (!list_empty(&pending_edge)) { struct btrfs_backref_node *upper; struct btrfs_backref_node *lower; - struct rb_node *rb_node; edge = list_first_entry(&pending_edge, struct btrfs_backref_edge, list[UPPER]); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index ea8aaf36647e..c0f1d6818df7 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1766,16 +1766,10 @@ static void link_block_group(struct btrfs_block_group *cache) { struct btrfs_space_info *space_info = cache->space_info; int index = btrfs_bg_flags_to_raid_index(cache->flags); - bool first = false; down_write(&space_info->groups_sem); - if (list_empty(&space_info->block_groups[index])) - first = true; list_add_tail(&cache->list, &space_info->block_groups[index]); up_write(&space_info->groups_sem); - - if (first) - btrfs_sysfs_add_block_group_type(cache); } static struct btrfs_block_group *btrfs_create_block_group_cache( @@ -1873,7 +1867,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) return ret; } -static int read_block_group_item(struct btrfs_block_group *cache, +static void read_block_group_item(struct btrfs_block_group *cache, struct btrfs_path *path, const struct btrfs_key *key) { @@ -1887,8 +1881,6 @@ static int read_block_group_item(struct btrfs_block_group *cache, sizeof(bgi)); cache->used = btrfs_stack_block_group_used(&bgi); cache->flags = btrfs_stack_block_group_flags(&bgi); - - return 0; } static int read_one_block_group(struct btrfs_fs_info *info, @@ -1907,9 +1899,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, if (!cache) return -ENOMEM; - ret = read_block_group_item(cache, path, key); - if (ret < 0) - goto error; + read_block_group_item(cache, path, key); set_free_space_tree_thresholds(cache); @@ -2035,8 +2025,18 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) btrfs_release_path(path); } - rcu_read_lock(); - list_for_each_entry_rcu(space_info, &info->space_info, list) { + list_for_each_entry(space_info, &info->space_info, list) { + int i; + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (list_empty(&space_info->block_groups[i])) + continue; + cache = list_first_entry(&space_info->block_groups[i], + struct btrfs_block_group, + list); + btrfs_sysfs_add_block_group_type(cache); + } + if (!(btrfs_get_alloc_profile(info, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID1_MASK | @@ -2056,7 +2056,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) list) inc_block_group_ro(cache, 1); } - rcu_read_unlock(); btrfs_init_global_block_rsv(info); ret = check_chunk_block_group_mappings(info); @@ -2097,12 +2096,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) return; while (!list_empty(&trans->new_bgs)) { + int index; + block_group = list_first_entry(&trans->new_bgs, struct btrfs_block_group, 
bg_list); if (ret) goto next; + index = btrfs_bg_flags_to_raid_index(block_group->flags); + ret = insert_block_group_item(trans, block_group); if (ret) btrfs_abort_transaction(trans, ret); @@ -2111,6 +2114,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) if (ret) btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, block_group); + + /* + * If we restriped during balance, we may have added a new raid + * type, so now add the sysfs entries when it is safe to do so. + * We don't have to worry about locking here as it's handled in + * btrfs_sysfs_add_block_group_type. + */ + if (block_group->space_info->block_group_kobjs[index] == NULL) + btrfs_sysfs_add_block_group_type(block_group); + /* Already aborted the transaction if it failed. */ next: btrfs_delayed_refs_rsv_release(fs_info, 1); @@ -2785,7 +2798,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) * finished yet (no block group item in the extent tree * yet, etc). If this is the case, wait for all free * space endio workers to finish and retry. This is a - * a very rare case so no need for a more efficient and + * very rare case so no need for a more efficient and * complex approach. */ if (ret == -ENOENT) { @@ -2961,6 +2974,13 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; + + /* + * Compression can use less space than we reserved, so wake + * tickets if that happens + */ + if (num_bytes < ram_bytes) + btrfs_try_granting_tickets(cache->fs_info, space_info); } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); @@ -2994,6 +3014,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, if (delalloc) cache->delalloc_bytes -= num_bytes; spin_unlock(&cache->lock); + + btrfs_try_granting_tickets(cache->fs_info, space_info); spin_unlock(&space_info->lock); } @@ -3002,12 +3024,10 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) struct list_head *head = &info->space_info; struct btrfs_space_info *found; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { + list_for_each_entry(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_METADATA) found->force_alloc = CHUNK_ALLOC_FORCE; } - rcu_read_unlock(); } static int should_alloc_chunk(struct btrfs_fs_info *fs_info, @@ -3338,14 +3358,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } spin_unlock(&info->block_group_cache_lock); - /* - * Now that all the block groups are freed, go through and free all the - * space_info structs. This is only called during the final stages of - * unmount, and so we know nobody is using them. We call - * synchronize_rcu() once before we start, just to be on the safe side. - */ - synchronize_rcu(); - btrfs_release_global_block_rsv(info); while (!list_empty(&info->space_info)) { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c47b6c6fea9f..92dd86bceae3 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -21,14 +21,18 @@ * new data the application may have written before commit. */ enum { - BTRFS_INODE_ORDERED_DATA_CLOSE, + BTRFS_INODE_FLUSH_ON_CLOSE, BTRFS_INODE_DUMMY, BTRFS_INODE_IN_DEFRAG, BTRFS_INODE_HAS_ASYNC_EXTENT, + /* + * Always set under the VFS' inode lock, otherwise it can cause races + * during fsync (we start as a fast fsync and then end up in a full + * fsync racing with ordered extent completion). 
+ */ BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_INODE_COPY_EVERYTHING, BTRFS_INODE_IN_DELALLOC_LIST, - BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_INODE_HAS_PROPS, BTRFS_INODE_SNAPSHOT_FLUSH, }; @@ -212,6 +216,11 @@ struct btrfs_inode { struct inode vfs_inode; }; +static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode) +{ + return inode->root->fs_info->sectorsize; +} + static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); @@ -324,23 +333,6 @@ struct btrfs_dio_private { u8 csums[]; }; -/* - * Disable DIO read nolock optimization, so new dio readers will be forced - * to grab i_mutex. It is used to avoid the endless truncate due to - * nonlocked dio read. - */ -static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode) -{ - set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); - smp_mb(); -} - -static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode) -{ - smp_mb__before_atomic(); - clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); -} - /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1ab56a734e70..eeface30facd 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -29,41 +29,6 @@ #include "extent_io.h" #include "extent_map.h" -int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out); -int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zlib_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, - size_t destlen); -struct list_head *zlib_alloc_workspace(unsigned int level); -void zlib_free_workspace(struct list_head *ws); -struct list_head *zlib_get_workspace(unsigned int level); - -int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out); -int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int lzo_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, - size_t destlen); -struct list_head *lzo_alloc_workspace(unsigned int level); -void lzo_free_workspace(struct list_head *ws); - -int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out); -int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zstd_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, - size_t destlen); -void zstd_init_workspace_manager(void); -void zstd_cleanup_workspace_manager(void); -struct list_head *zstd_alloc_workspace(unsigned int level); -void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_get_workspace(unsigned int level); -void zstd_put_workspace(struct list_head *ws); - static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; const char* btrfs_compress_type2str(enum btrfs_compression_type type) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 9f3dbe372631..8001b700ea3a 100644 
--- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -144,4 +144,39 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len); int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); +int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int zlib_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +struct list_head *zlib_alloc_workspace(unsigned int level); +void zlib_free_workspace(struct list_head *ws); +struct list_head *zlib_get_workspace(unsigned int level); + +int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +struct list_head *lzo_alloc_workspace(unsigned int level); +void lzo_free_workspace(struct list_head *ws); + +int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int zstd_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +void zstd_init_workspace_manager(void); +void zstd_cleanup_workspace_manager(void); +struct list_head *zstd_alloc_workspace(unsigned int level); +void zstd_free_workspace(struct list_head *ws); +struct list_head *zstd_get_workspace(unsigned int level); +void zstd_put_workspace(struct list_head *ws); + #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cd392da69b81..113da62dc17f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -198,7 +198,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, btrfs_node_key(buf, &disk_key, 0); cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, - &disk_key, level, buf->start, 0); + &disk_key, level, buf->start, 0, + BTRFS_NESTING_NEW_ROOT); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -957,7 +958,8 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush( const struct btrfs_disk_key *disk_key, int level, u64 hint, - u64 empty_size) + u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *ret; @@ -986,7 +988,7 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush( ret = btrfs_alloc_tree_block(trans, root, parent_start, root->root_key.objectid, disk_key, level, - hint, empty_size); + hint, empty_size, nest); trans->can_flush_pending_bgs = true; return ret; @@ -1009,7 +1011,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, - u64 search_start, u64 empty_size) + u64 search_start, u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_disk_key disk_key; @@ -1040,7 +1043,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, 
parent_start = parent->start; cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key, - level, search_start, empty_size); + level, search_start, empty_size, nest); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -1061,6 +1064,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } @@ -1068,6 +1073,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } @@ -1100,6 +1107,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (last_ref) { ret = tree_mod_log_free_eb(buf); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } @@ -1446,7 +1455,8 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret) + struct extent_buffer **cow_ret, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; @@ -1485,7 +1495,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, */ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); ret = __btrfs_cow_block(trans, root, buf, parent, - parent_slot, cow_ret, search_start, 0); + parent_slot, cow_ret, search_start, 0, nest); trace_btrfs_cow_block(root, buf, *cow_ret); @@ -1657,7 +1667,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, - (end_slot - i) * blocksize)); + (end_slot - i) * blocksize), + BTRFS_NESTING_COW); if (err) { btrfs_tree_unlock(cur); free_extent_buffer(cur); @@ -1855,7 +1866,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(child); btrfs_set_lock_blocking_write(child); - ret = btrfs_cow_block(trans, root, child, mid, 0, &child); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child, + BTRFS_NESTING_COW); if (ret) { btrfs_tree_unlock(child); free_extent_buffer(child); @@ -1891,10 +1903,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, left = NULL; if (left) { - btrfs_tree_lock(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); btrfs_set_lock_blocking_write(left); wret = btrfs_cow_block(trans, root, left, - parent, pslot - 1, &left); + parent, pslot - 1, &left, + BTRFS_NESTING_LEFT_COW); if (wret) { ret = wret; goto enospc; @@ -1906,10 +1919,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, right = NULL; if (right) { - btrfs_tree_lock(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); btrfs_set_lock_blocking_write(right); wret = btrfs_cow_block(trans, root, right, - parent, pslot + 1, &right); + parent, pslot + 1, &right, + BTRFS_NESTING_RIGHT_COW); if (wret) { ret = wret; goto enospc; @@ -2069,7 +2083,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (left) { u32 left_nr; - btrfs_tree_lock(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); btrfs_set_lock_blocking_write(left); left_nr = btrfs_header_nritems(left); @@ -2077,7 
+2091,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, wret = 1; } else { ret = btrfs_cow_block(trans, root, left, parent, - pslot - 1, &left); + pslot - 1, &left, + BTRFS_NESTING_LEFT_COW); if (ret) wret = 1; else { @@ -2123,7 +2138,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (right) { u32 right_nr; - btrfs_tree_lock(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); btrfs_set_lock_blocking_write(right); right_nr = btrfs_header_nritems(right); @@ -2132,7 +2147,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, } else { ret = btrfs_cow_block(trans, root, right, parent, pslot + 1, - &right); + &right, BTRFS_NESTING_RIGHT_COW); if (ret) wret = 1; else { @@ -2601,7 +2616,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, * We don't know the level of the root node until we actually * have it read locked */ - b = btrfs_read_lock_root_node(root); + b = __btrfs_read_lock_root_node(root, p->recurse); level = btrfs_header_level(b); if (level > write_lock_level) goto out; @@ -2740,11 +2755,13 @@ again: btrfs_set_path_blocking(p); if (last_level) err = btrfs_cow_block(trans, root, b, NULL, 0, - &b); + &b, + BTRFS_NESTING_COW); else err = btrfs_cow_block(trans, root, b, p->nodes[level + 1], - p->slots[level + 1], &b); + p->slots[level + 1], &b, + BTRFS_NESTING_COW); if (err) { ret = err; goto done; @@ -2875,7 +2892,8 @@ cow_done: } else { if (!btrfs_tree_read_lock_atomic(b)) { btrfs_set_path_blocking(p); - btrfs_tree_read_lock(b); + __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL, + p->recurse); } p->locks[level] = BTRFS_READ_LOCK; } @@ -3164,6 +3182,58 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, } /* + * Check key order of two sibling extent buffers. + * + * Return true if something is wrong. + * Return false if everything is fine. + * + * Tree-checker only works inside one tree block, thus the following + * corruption can not be detected by tree-checker: + * + * Leaf @left | Leaf @right + * -------------------------------------------------------------- + * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 | + * + * Key f6 in leaf @left itself is valid, but not valid when the next + * key in leaf @right is 7. + * This can only be checked at tree block merge time. + * And since tree checker has ensured all key order in each tree block + * is correct, we only need to bother the last key of @left and the first + * key of @right. + */ +static bool check_sibling_keys(struct extent_buffer *left, + struct extent_buffer *right) +{ + struct btrfs_key left_last; + struct btrfs_key right_first; + int level = btrfs_header_level(left); + int nr_left = btrfs_header_nritems(left); + int nr_right = btrfs_header_nritems(right); + + /* No key to check in one of the tree blocks */ + if (!nr_left || !nr_right) + return false; + + if (level) { + btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); + btrfs_node_key_to_cpu(right, &right_first, 0); + } else { + btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); + btrfs_item_key_to_cpu(right, &right_first, 0); + } + + if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) { + btrfs_crit(left->fs_info, +"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", + left_last.objectid, left_last.type, + left_last.offset, right_first.objectid, + right_first.type, right_first.offset); + return true; + } + return false; +} + +/* * try to push data from one node into the next node left in the * tree. 
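The __btrfs_tree_lock(..., BTRFS_NESTING_LEFT/RIGHT/...) conversions in the hunks above and below give lockdep a distinct subclass for each sibling tree block held at the same time (parent, left and right neighbours during a balance), since they all share one lock class. The same idiom with a generic mutex, as an illustration rather than btrfs code:

	struct node { struct mutex lock; };

	static void lock_parent_and_child(struct node *parent, struct node *child)
	{
		mutex_lock(&parent->lock);
		/* Second lock of the same class: annotate the nesting so
		 * lockdep does not report a false self-deadlock. */
		mutex_lock_nested(&child->lock, SINGLE_DEPTH_NESTING);
	}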
* @@ -3207,6 +3277,12 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); + /* dst is the left eb, src is the middle eb */ + if (check_sibling_keys(dst, src)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); if (ret) { btrfs_abort_transaction(trans, ret); @@ -3275,6 +3351,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans, if (max_push < push_items) push_items = max_push; + /* dst is the right eb, src is the middle eb */ + if (check_sibling_keys(src, dst)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); BUG_ON(ret < 0); memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), @@ -3331,7 +3413,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_node_key(lower, &lower_key, 0); c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level, - root->node->start, 0); + root->node->start, 0, + BTRFS_NESTING_NEW_ROOT); if (IS_ERR(c)) return PTR_ERR(c); @@ -3461,7 +3544,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_node_key(c, &disk_key, mid); split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level, - c->start, 0); + c->start, 0, BTRFS_NESTING_SPLIT); if (IS_ERR(split)) return PTR_ERR(split); @@ -3730,7 +3813,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (IS_ERR(right)) return 1; - btrfs_tree_lock(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); btrfs_set_lock_blocking_write(right); free_space = btrfs_leaf_free_space(right); @@ -3739,7 +3822,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root /* cow and double check */ ret = btrfs_cow_block(trans, root, right, upper, - slot + 1, &right); + slot + 1, &right, BTRFS_NESTING_RIGHT_COW); if (ret) goto out_unlock; @@ -3751,6 +3834,12 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; + if (check_sibling_keys(left, right)) { + ret = -EUCLEAN; + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } if (path->slots[0] == left_nritems && !empty) { /* Key greater than all keys in the leaf, right neighbor has * enough room for it and we're not emptying our leaf to delete @@ -3963,7 +4052,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (IS_ERR(left)) return 1; - btrfs_tree_lock(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); btrfs_set_lock_blocking_write(left); free_space = btrfs_leaf_free_space(left); @@ -3974,7 +4063,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root /* cow and double check */ ret = btrfs_cow_block(trans, root, left, - path->nodes[1], slot - 1, &left); + path->nodes[1], slot - 1, &left, + BTRFS_NESTING_LEFT_COW); if (ret) { /* we hit -ENOSPC, but it isn't fatal here */ if (ret == -ENOSPC) @@ -3988,6 +4078,10 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } + if (check_sibling_keys(left, right)) { + ret = -EUCLEAN; + goto out; + } return __push_leaf_left(path, min_data_size, empty, left, free_space, right_nritems, max_slot); @@ -4236,8 +4330,18 @@ again: else btrfs_item_key(l, &disk_key, mid); + /* + * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double + * split, because we're only allowed
to have MAX_LOCKDEP_SUBCLASSES + * subclasses, which is 8 at the time of this patch, and we've maxed it + * out. In the future we could add a + * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just + * use BTRFS_NESTING_NEW_ROOT. + */ right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, - l->start, 0); + l->start, 0, num_doubles ? + BTRFS_NESTING_NEW_ROOT : + BTRFS_NESTING_SPLIT); if (IS_ERR(right)) return PTR_ERR(right); @@ -4482,9 +4586,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - setup_items_for_insert(root, path, new_key, &item_size, - item_size, item_size + - sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, path, new_key, &item_size, 1); leaf = path->nodes[0]; memcpy_extent_buffer(leaf, btrfs_item_ptr_offset(leaf, path->slots[0]), @@ -4657,14 +4759,20 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) } } -/* - * this is a helper for btrfs_insert_empty_items, the main goal here is - * to save stack depth by doing the bulk of the work in a function - * that doesn't call btrfs_search_slot +/** + * setup_items_for_insert - Helper called before inserting one or more items + * to a leaf. Main purpose is to save stack depth by doing the bulk of the work + * in a function that doesn't call btrfs_search_slot + * + * @root: root we are inserting items to + * @path: points to the leaf/slot where we are going to insert new items + * @cpu_key: array of keys for items to be inserted + * @data_size: size of the body of each item we are going to insert + * @nr: size of @cpu_key/@data_size arrays */ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr) + int nr) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_item *item; @@ -4675,6 +4783,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *leaf; int slot; struct btrfs_map_token token; + u32 total_size; + u32 total_data = 0; + + for (i = 0; i < nr; i++) + total_data += data_size[i]; + total_size = total_data + (nr * sizeof(struct btrfs_item)); if (path->slots[0] == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); @@ -4701,7 +4815,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (old_data < data_end) { btrfs_print_leaf(leaf); - btrfs_crit(fs_info, "slot %d old_data %d data_end %d", + btrfs_crit(fs_info, + "item at slot %d with data offset %u beyond data end of leaf %u", slot, old_data, data_end); BUG(); } @@ -4734,8 +4849,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(slot + i); - btrfs_set_token_item_offset(&token, item, data_end - data_size[i]); data_end -= data_size[i]; + btrfs_set_token_item_offset(&token, item, data_end); btrfs_set_token_item_size(&token, item, data_size[i]); } @@ -4777,8 +4892,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(root, path, cpu_key, data_size, - total_data, total_size, nr); + setup_items_for_insert(root, path, cpu_key, data_size, nr); return 0; } @@ -5115,7 +5229,7 @@ again: slot--; /* * check this node pointer against the min_trans parameters. - * If it is too old, old, skip to the next one. + * If it is too old, skip to the next one. 
+ * If it is too old, skip to the next one.
*/ while (slot < nritems) { u64 gen; @@ -5379,7 +5493,9 @@ again: } if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_read_lock(next); + __btrfs_tree_read_lock(next, + BTRFS_NESTING_RIGHT, + path->recurse); } next_rw_lock = BTRFS_READ_LOCK; } @@ -5414,7 +5530,9 @@ again: ret = btrfs_try_tree_read_lock(next); if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_read_lock(next); + __btrfs_tree_read_lock(next, + BTRFS_NESTING_RIGHT, + path->recurse); } next_rw_lock = BTRFS_READ_LOCK; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9a72896bed2e..aac3d6f4e35b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -374,6 +374,7 @@ struct btrfs_path { unsigned int search_commit_root:1; unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; + unsigned int recurse:1; }; #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ sizeof(struct btrfs_item)) @@ -494,7 +495,7 @@ enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_DONE = 2, }; -void btrfs_init_async_reclaim_work(struct work_struct *work); +void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); /* fs_info */ struct reloc_control; @@ -541,11 +542,6 @@ enum { /* Used to record internally whether fs has been frozen */ BTRFS_FS_FROZEN, /* - * Indicate that a whole-filesystem exclusive operation is running - * (device replace, resize, device add/delete, balance) - */ - BTRFS_FS_EXCL_OP, - /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized. * Set and cleared while holding fs_info::balance_mutex. @@ -565,6 +561,19 @@ enum { BTRFS_FS_DISCARD_RUNNING, }; +/* + * Exclusive operations (device replace, resize, device add/remove, balance) + */ +enum btrfs_exclusive_operation { + BTRFS_EXCLOP_NONE, + BTRFS_EXCLOP_BALANCE, + BTRFS_EXCLOP_DEV_ADD, + BTRFS_EXCLOP_DEV_REMOVE, + BTRFS_EXCLOP_DEV_REPLACE, + BTRFS_EXCLOP_RESIZE, + BTRFS_EXCLOP_SWAP_ACTIVATE, +}; + struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; @@ -912,6 +921,7 @@ struct btrfs_fs_info { /* Used to reclaim the metadata space in the background. */ struct work_struct async_reclaim_work; + struct work_struct async_data_reclaim_work; spinlock_t unused_bgs_lock; struct list_head unused_bgs; @@ -935,6 +945,9 @@ struct btrfs_fs_info { */ int send_in_progress; + /* Type of exclusive operation running */ + unsigned long exclusive_operation; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -1181,24 +1194,40 @@ struct btrfs_root { #endif }; -struct btrfs_clone_extent_info { +/* + * Structure that conveys information about an extent that is going to replace + * all the extents in a file range. + */ +struct btrfs_replace_extent_info { u64 disk_offset; u64 disk_len; u64 data_offset; u64 data_len; u64 file_offset; + /* Pointer to a file extent item of type regular or prealloc. */ char *extent_buf; - u32 item_size; + /* + * Set to true when attempting to replace a file range with a new extent + * described by this structure, set to false when attempting to clone an + * existing extent into a file range. + */ + bool is_new_extent; + /* Meaningful only if is_new_extent is true. */ + int qgroup_reserved; + /* + * Meaningful only if is_new_extent is true. + * Used to track how many extent items we have already inserted in a + * subvolume tree that refer to the extent described by this structure, + * so that we know when to create a new delayed ref or update an existing + * one. 
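The enum btrfs_exclusive_operation introduced in this header replaces the old single BTRFS_FS_EXCL_OP bit, so the filesystem can record which exclusive operation is running rather than just that one is. The diff only shows the declarations of btrfs_exclop_start()/btrfs_exclop_finish(); a minimal user-space model of the single-slot semantics (the compare-and-swap body is an assumption, not taken from the patch):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum excl_op { EXCLOP_NONE, EXCLOP_BALANCE, EXCLOP_DEV_ADD, EXCLOP_RESIZE };

static _Atomic int exclusive_operation = EXCLOP_NONE;

/* Succeeds only when no other exclusive operation holds the slot. */
static bool exclop_start(enum excl_op type)
{
	int expected = EXCLOP_NONE;

	return atomic_compare_exchange_strong(&exclusive_operation,
					      &expected, type);
}

static void exclop_finish(void)
{
	atomic_store(&exclusive_operation, EXCLOP_NONE);
}

int main(void)
{
	printf("balance: %d\n", exclop_start(EXCLOP_BALANCE)); /* 1 */
	printf("resize:  %d\n", exclop_start(EXCLOP_RESIZE));  /* 0, busy */
	exclop_finish();
	printf("resize:  %d\n", exclop_start(EXCLOP_RESIZE));  /* 1 */
	return 0;
}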
+ */ + int insertions; }; struct btrfs_file_private { void *filldir_buf; }; -static inline u32 btrfs_inode_sectorsize(const struct inode *inode) -{ - return btrfs_sb(inode->i_sb)->sectorsize; -} static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { @@ -1391,6 +1420,16 @@ static inline void btrfs_init_map_token(struct btrfs_map_token *token, #define cpu_to_le8(v) (v) #define __le8 u8 +static inline u8 get_unaligned_le8(const void *p) +{ + return *(u8 *)p; +} + +static inline void put_unaligned_le8(u8 val, void *p) +{ + *(u8 *)p = val; +} + #define read_eb_member(eb, ptr, type, member, result) (\ read_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ @@ -1449,27 +1488,25 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ { \ const type *p = page_address(eb->pages[0]); \ - u##bits res = le##bits##_to_cpu(p->member); \ - return res; \ + return get_unaligned_le##bits(&p->member); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, \ u##bits val) \ { \ type *p = page_address(eb->pages[0]); \ - p->member = cpu_to_le##bits(val); \ + put_unaligned_le##bits(val, &p->member); \ } #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const type *s) \ { \ - return le##bits##_to_cpu(s->member); \ + return get_unaligned_le##bits(&s->member); \ } \ static inline void btrfs_set_##name(type *s, u##bits val) \ { \ - s->member = cpu_to_le##bits(val); \ + put_unaligned_le##bits(val, &s->member); \ } - static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s) { @@ -2524,7 +2561,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, const struct btrfs_disk_key *key, int level, u64 hint, - u64 empty_size); + u64 empty_size, + enum btrfs_lock_nesting nest); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -2592,6 +2630,8 @@ enum btrfs_reserve_flush_enum { * * Can be interrupted by fatal signal.
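The setget macros above now expand, via token pasting, to get_unaligned_le##bits()/put_unaligned_le##bits(), which is why 8-bit variants have to exist at all: a single byte can never be misaligned, but the macro still needs a function of that name. A stand-alone sketch of the idea behind unaligned little-endian accessors (byte-wise assembly; the kernel's per-architecture implementations differ):

#include <stdint.h>
#include <stdio.h>

/* Byte-wise load: safe for any address, fixed little-endian layout. */
static uint32_t demo_get_unaligned_le32(const void *p)
{
	const uint8_t *b = p;

	return (uint32_t)b[0] | (uint32_t)b[1] << 8 |
	       (uint32_t)b[2] << 16 | (uint32_t)b[3] << 24;
}

static uint8_t demo_get_unaligned_le8(const void *p)
{
	return *(const uint8_t *)p;	/* a byte is never misaligned */
}

int main(void)
{
	uint8_t raw[] = { 0xff, 0x78, 0x56, 0x34, 0x12 };

	/* Read from an odd offset: no alignment assumption is needed. */
	printf("0x%08x\n", demo_get_unaligned_le32(raw + 1)); /* 0x12345678 */
	printf("0x%02x\n", demo_get_unaligned_le8(raw));      /* 0xff */
	return 0;
}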
*/ + BTRFS_RESERVE_FLUSH_DATA, + BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, BTRFS_RESERVE_FLUSH_ALL, /* @@ -2619,7 +2659,7 @@ enum btrfs_flush_state { int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); -void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, +void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); @@ -2651,8 +2691,6 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); -struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, u64 min_trans); @@ -2665,7 +2703,8 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret); + struct extent_buffer **cow_ret, + enum btrfs_lock_nesting nest); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -2713,7 +2752,7 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr); + int nr); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, @@ -2930,6 +2969,10 @@ void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size); u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ +blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); +int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, int mirror); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, @@ -2956,7 +2999,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_snapshot(struct btrfs_root *root); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr); int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); @@ -3017,6 +3060,7 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, u64 end, int uptodate); extern const struct dentry_operations btrfs_dentry_operations; +ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3031,6 +3075,9 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, 
struct btrfs_ioctl_balance_args *bargs); +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); /* file.c */ int __init btrfs_auto_defrag_init(void); @@ -3053,9 +3100,9 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, int drop_cache); -int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, +int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, - struct btrfs_clone_extent_info *clone_info, + struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end); @@ -3536,9 +3583,7 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_inode_set_ops(struct inode *inode); void btrfs_test_destroy_inode(struct inode *inode); - static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 0e354e9e57d0..bacee09b7bfd 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -115,126 +115,15 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; - u64 used; - int ret = 0; - int need_commit = 2; - int have_pinned_space; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA; /* Make sure bytes are sectorsize aligned */ bytes = ALIGN(bytes, fs_info->sectorsize); - if (btrfs_is_free_space_inode(inode)) { - need_commit = 0; - ASSERT(current->journal_info); - } - -again: - /* Make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - used = btrfs_space_info_used(data_sinfo, true); - - if (used + bytes > data_sinfo->total_bytes) { - struct btrfs_trans_handle *trans; - - /* - * If we don't have enough free bytes in this space then we need - * to alloc a new chunk. - */ - if (!data_sinfo->full) { - u64 alloc_target; - - data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; - spin_unlock(&data_sinfo->lock); - - alloc_target = btrfs_data_alloc_profile(fs_info); - /* - * It is ugly that we don't call nolock join - * transaction for the free space inode case here. - * But it is safe because we only do the data space - * reservation for the free space cache in the - * transaction context, the common join transaction - * just increase the counter of the current transaction - * handler, doesn't try to acquire the trans_lock of - * the fs. - */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_chunk_alloc(trans, alloc_target, - CHUNK_ALLOC_NO_FORCE); - btrfs_end_transaction(trans); - if (ret < 0) { - if (ret != -ENOSPC) - return ret; - else { - have_pinned_space = 1; - goto commit_trans; - } - } - - goto again; - } - - /* - * If we don't have enough pinned space to deal with this - * allocation, and no removed chunk in current transaction, - * don't bother committing the transaction. 
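The open-coded retry/commit state machine being deleted in this hunk collapses into one call to btrfs_reserve_data_bytes() with a per-caller flush level, as the new function body further down shows. A toy model of that flush-escalation idea (the levels and reclaim amounts here are invented for illustration, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

enum flush_level { FLUSH_NONE, FLUSH_DELALLOC, FLUSH_COMMIT_TRANS };

static unsigned long long free_bytes = 64 * 1024;

/* Pretend each flush step reclaims some space. */
static void run_flush_step(enum flush_level level)
{
	free_bytes += level * 32 * 1024ULL;
}

/* Escalate through the allowed flush levels until the reservation fits. */
static bool reserve_data_bytes(unsigned long long bytes, enum flush_level max)
{
	enum flush_level l = FLUSH_NONE;

	for (;;) {
		if (free_bytes >= bytes) {
			free_bytes -= bytes;
			return true;
		}
		if (l > max)
			return false;
		run_flush_step(l++);
	}
}

int main(void)
{
	/* The free space inode must not commit a transaction: lower cap. */
	printf("%d\n", reserve_data_bytes(100 * 1024, FLUSH_DELALLOC));
	printf("%d\n", reserve_data_bytes(100 * 1024, FLUSH_COMMIT_TRANS));
	return 0;
}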
- */ - have_pinned_space = __percpu_counter_compare( - &data_sinfo->total_bytes_pinned, - used + bytes - data_sinfo->total_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - spin_unlock(&data_sinfo->lock); - - /* Commit the current transaction and try again */ -commit_trans: - if (need_commit) { - need_commit--; - - if (need_commit > 0) { - btrfs_start_delalloc_roots(fs_info, -1); - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, - (u64)-1); - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - if (have_pinned_space >= 0 || - test_bit(BTRFS_TRANS_HAVE_FREE_BGS, - &trans->transaction->flags) || - need_commit > 0) { - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - /* - * The cleaner kthread might still be doing iput - * operations. Wait for it to finish so that - * more space is released. We don't need to - * explicitly run the delayed iputs here because - * the commit_transaction would have woken up - * the cleaner. - */ - ret = btrfs_wait_on_delayed_iputs(fs_info); - if (ret) - return ret; - goto again; - } else { - btrfs_end_transaction(trans); - } - } - - trace_btrfs_space_reservation(fs_info, - "space_info:enospc", - data_sinfo->flags, bytes, 1); - return -ENOSPC; - } - btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); - spin_unlock(&data_sinfo->lock); + if (btrfs_is_free_space_inode(inode)) + flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; - return 0; + return btrfs_reserve_data_bytes(fs_info, bytes, flush); } int btrfs_check_data_free_space(struct btrfs_inode *inode, @@ -277,9 +166,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); data_sinfo = fs_info->data_sinfo; - spin_lock(&data_sinfo->lock); - btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len); - spin_unlock(&data_sinfo->lock); + btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len); } /* diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index bf1595a42a98..5aba81e16113 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -627,8 +627,7 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { - ret = btrfs_qgroup_reserve_meta_prealloc(root, - fs_info->nodesize, true); + ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); if (ret < 0) return ret; ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, @@ -769,8 +768,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, } /* insert the keys of the items */ - setup_items_for_insert(root, path, keys, data_size, - total_data_size, total_size, nitems); + setup_items_for_insert(root, path, keys, data_size, nitems); /* insert the dir index items */ slot = path->slots[0]; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index db93909b25e0..4a0243cb9d97 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -64,10 +64,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret); -static void btrfs_dev_replace_update_device_in_mapping_tree( - struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev, - struct btrfs_device *tgtdev); static int btrfs_dev_replace_kthread(void *data); int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) @@ -224,13 +220,12 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_device *device; struct block_device *bdev; - struct list_head *devices; struct rcu_string 
*name; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; *device_out = NULL; - if (fs_info->fs_devices->seeding) { + if (srcdev->fs_devices->seeding) { btrfs_err(fs_info, "the filesystem is a seed filesystem!"); return -EINVAL; } @@ -244,8 +239,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, sync_blockdev(bdev); - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { if (device->bdev == bdev) { btrfs_err(fs_info, "target device is in the filesystem!"); @@ -512,7 +506,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); up_write(&dev_replace->rwsem); - ret = btrfs_sysfs_add_devices_dir(tgt_device->fs_devices, tgt_device); + ret = btrfs_sysfs_add_device(tgt_device); if (ret) btrfs_err(fs_info, "kobj add dev failed %d", ret); @@ -599,6 +593,63 @@ static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) wake_up(&fs_info->dev_replace.replace_wait); } +/* + * When finishing the device replace, before swapping the source device with the + * target device we must update the chunk allocation state in the target device, + * as it is empty because replace works by directly copying the chunks and not + * through the normal chunk allocation path. + */ +static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_state *cached_state = NULL; + u64 start = 0; + u64 found_start; + u64 found_end; + int ret = 0; + + lockdep_assert_held(&srcdev->fs_info->chunk_mutex); + + while (!find_first_extent_bit(&srcdev->alloc_state, start, + &found_start, &found_end, + CHUNK_ALLOCATED, &cached_state)) { + ret = set_extent_bits(&tgtdev->alloc_state, found_start, + found_end, CHUNK_ALLOCATED); + if (ret) + break; + start = found_end + 1; + } + + free_extent_state(cached_state); + return ret; +} + +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct map_lookup *map; + u64 start = 0; + int i; + + write_lock(&em_tree->lock); + do { + em = lookup_extent_mapping(em_tree, start, (u64)-1); + if (!em) + break; + map = em->map_lookup; + for (i = 0; i < map->num_stripes; i++) + if (srcdev == map->stripes[i].dev) + map->stripes[i].dev = tgtdev; + start = em->start + em->len; + free_extent_map(em); + } while (start); + write_unlock(&em_tree->lock); +} + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { @@ -630,7 +681,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_roots(fs_info, -1); + ret = btrfs_start_delalloc_roots(fs_info, U64_MAX); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; @@ -673,8 +724,14 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, dev_replace->time_stopped = ktime_get_real_seconds(); dev_replace->item_needs_writeback = 1; - /* replace old device with new one in mapping tree */ + /* + * Update allocation state in the new device and replace the old device + * with the new one in the mapping tree. 
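The new btrfs_set_target_alloc_state() above mirrors every CHUNK_ALLOCATED range from the source device's alloc_state tree into the target's, because replace copies chunks verbatim instead of going through the allocator. A small model of that cursor-driven copy loop, with plain arrays standing in for the extent-io-tree API (assumed types, for illustration only):

#include <stdio.h>

struct range { unsigned long long start, end; };

/* Walk ranges marked in src from a moving cursor, mirroring each into
 * dst; the cursor resumes after the last found range, like
 * find_first_extent_bit() followed by set_extent_bits(). */
static void mirror_alloc_state(const struct range *src, int nr,
			       struct range *dst, int *dst_nr)
{
	unsigned long long start = 0;

	for (int i = 0; i < nr; i++) {
		if (src[i].end < start)
			continue;		/* behind the cursor */
		dst[(*dst_nr)++] = src[i];
		start = src[i].end + 1;		/* resume past this range */
	}
}

int main(void)
{
	struct range src[] = { { 0, 1048575 }, { 4194304, 5242879 } };
	struct range dst[2];
	int dst_nr = 0;

	mirror_alloc_state(src, 2, dst, &dst_nr);
	for (int i = 0; i < dst_nr; i++)
		printf("[%llu, %llu]\n", dst[i].start, dst[i].end);
	return 0;
}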
+ */ if (!scrub_ret) { + scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device); + if (scrub_ret) + goto error; btrfs_dev_replace_update_device_in_mapping_tree(fs_info, src_device, tgt_device); @@ -685,6 +742,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_dev_name(src_device), src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); +error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -743,9 +801,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* replace the sysfs entry */ - btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device); + btrfs_sysfs_remove_device(src_device); btrfs_sysfs_update_devid(tgt_device); - btrfs_rm_dev_replace_free_srcdev(src_device); + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) + btrfs_scratch_superblocks(fs_info, src_device->bdev, + src_device->name->str); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); @@ -754,33 +814,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); - return 0; -} - -static void btrfs_dev_replace_update_device_in_mapping_tree( - struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev, - struct btrfs_device *tgtdev) -{ - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; - u64 start = 0; - int i; + btrfs_rm_dev_replace_free_srcdev(src_device); - write_lock(&em_tree->lock); - do { - em = lookup_extent_mapping(em_tree, start, (u64)-1); - if (!em) - break; - map = em->map_lookup; - for (i = 0; i < map->num_stripes; i++) - if (srcdev == map->stripes[i].dev) - map->stripes[i].dev = tgtdev; - start = em->start + em->len; - free_extent_map(em); - } while (start); - write_unlock(&em_tree->lock); + return 0; } /* @@ -983,7 +1019,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) * should never allow both to start and pause. We don't want to allow * dev-replace to start anyway. 
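Swapping the devices in the mapping tree, done by btrfs_dev_replace_update_device_in_mapping_tree() above, is a plain pointer substitution over every stripe of every chunk mapping. A compact user-space model of that walk (simplified structs stand in for extent_map/map_lookup):

#include <stdio.h>

struct device { const char *name; };
struct stripe { struct device *dev; };
struct chunk_map { struct stripe stripes[2]; int num_stripes; };

/* Point every stripe that referenced src at tgt instead. */
static void update_mapping_tree(struct chunk_map *maps, int nr,
				struct device *src, struct device *tgt)
{
	for (int m = 0; m < nr; m++)
		for (int i = 0; i < maps[m].num_stripes; i++)
			if (maps[m].stripes[i].dev == src)
				maps[m].stripes[i].dev = tgt;
}

int main(void)
{
	struct device sda = { "sda" }, sdb = { "sdb" }, sdc = { "sdc" };
	struct chunk_map maps[] = {
		{ { { &sda }, { &sdb } }, 2 },
		{ { { &sdb }, { &sda } }, 2 },
	};

	update_mapping_tree(maps, 2, &sda, &sdc);	/* replace sda by sdc */
	printf("%s %s\n", maps[0].stripes[0].dev->name,
	       maps[1].stripes[1].dev->name);		/* sdc sdc */
	return 0;
}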
*/ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { down_write(&dev_replace->rwsem); dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; @@ -1020,7 +1056,7 @@ static int btrfs_dev_replace_kthread(void *data) ret = btrfs_dev_replace_finishing(fs_info, ret); WARN_ON(ret && ret != -ECANCELED); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return 0; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f6bba7eb1fa1..8e3438672a82 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,7 +50,6 @@ BTRFS_SUPER_FLAG_METADUMP |\ BTRFS_SUPER_FLAG_METADUMP_V2) -static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, @@ -205,53 +204,6 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, #endif /* - * extents on the btree inode are pretty simple, there's one extent - * that covers the entire device - */ -struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len) -{ - struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_map *em; - int ret; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - if (em) { - read_unlock(&em_tree->lock); - goto out; - } - read_unlock(&em_tree->lock); - - em = alloc_extent_map(); - if (!em) { - em = ERR_PTR(-ENOMEM); - goto out; - } - em->start = 0; - em->len = (u64)-1; - em->block_len = (u64)-1; - em->block_start = 0; - - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); - if (ret == -EEXIST) { - free_extent_map(em); - em = lookup_extent_mapping(em_tree, start, len); - if (!em) - em = ERR_PTR(-EIO); - } else if (ret) { - free_extent_map(em); - em = ERR_PTR(ret); - } - write_unlock(&em_tree->lock); - -out: - return em; -} - -/* * Compute the csum of a btree block and store the result to provided buffer. */ static void csum_tree_block(struct extent_buffer *buf, u8 *result) @@ -545,38 +497,35 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) static int check_tree_block_fsid(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; u8 fsid[BTRFS_FSID_SIZE]; - int ret = 1; + u8 *metadata_uuid; read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE); - while (fs_devices) { - u8 *metadata_uuid; + /* + * Checking the incompat flag is only valid for the current fs. For + * seed devices it's forbidden to have their uuid changed so reading + * ->fsid in this case is fine + */ + if (btrfs_fs_incompat(fs_info, METADATA_UUID)) + metadata_uuid = fs_devices->metadata_uuid; + else + metadata_uuid = fs_devices->fsid; - /* - * Checking the incompat flag is only valid for the current - * fs. 
For seed devices it's forbidden to have their uuid - * changed so reading ->fsid in this case is fine - */ - if (fs_devices == fs_info->fs_devices && - btrfs_fs_incompat(fs_info, METADATA_UUID)) - metadata_uuid = fs_devices->metadata_uuid; - else - metadata_uuid = fs_devices->fsid; + if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) + return 0; - if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) { - ret = 0; - break; - } - fs_devices = fs_devices->seed; - } - return ret; + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) + if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE)) + return 0; + + return 1; } -static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, - u64 phy_offset, struct page *page, - u64 start, u64 end, int mirror) +int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror) { u64 found_start; int found_level; @@ -636,16 +585,15 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, csum_tree_block(eb, result); if (memcmp_extent_buffer(eb, result, 0, csum_size)) { - u32 val; - u32 found = 0; - - memcpy(&found, result, csum_size); + u8 val[BTRFS_CSUM_SIZE] = { 0 }; read_extent_buffer(eb, &val, 0, csum_size); btrfs_warn_rl(fs_info, - "%s checksum verify failed on %llu wanted %x found %x level %d", + "%s checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d", fs_info->sb->s_id, eb->start, - val, found, btrfs_header_level(eb)); + CSUM_FMT_VALUE(csum_size, val), + CSUM_FMT_VALUE(csum_size, result), + btrfs_header_level(eb)); ret = -EUCLEAN; goto err; } @@ -865,9 +813,8 @@ static int check_async_write(struct btrfs_fs_info *fs_info, return 1; } -static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, - unsigned long bio_flags) +blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int async = check_async_write(fs_info, BTRFS_I(inode)); @@ -952,11 +899,6 @@ static int btree_writepages(struct address_space *mapping, return btree_write_cache_pages(mapping, wbc); } -static int btree_readpage(struct file *file, struct page *page) -{ - return extent_read_full_page(page, btree_get_extent, 0); -} - static int btree_releasepage(struct page *page, gfp_t gfp_flags) { if (PageWriteback(page) || PageDirty(page)) @@ -996,7 +938,6 @@ static int btree_set_page_dirty(struct page *page) } static const struct address_space_operations btree_aops = { - .readpage = btree_readpage, .writepages = btree_writepages, .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, @@ -1209,7 +1150,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; - leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, + BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; @@ -1281,7 +1223,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, */ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, - NULL, 0, 0, 0); + NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { btrfs_put_root(root); return ERR_CAST(leaf); @@ -1506,10 +1448,12 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) struct btrfs_root *root; while 
(!list_empty(&fs_info->allocated_roots)) { + char buf[BTRFS_ROOT_NAME_BUF_LEN]; + root = list_first_entry(&fs_info->allocated_roots, struct btrfs_root, leak_list); - btrfs_err(fs_info, "leaked root %llu-%llu refcount %d", - root->root_key.objectid, root->root_key.offset, + btrfs_err(fs_info, "leaked root %s refcount %d", + btrfs_root_name(root->root_key.objectid, buf), refcount_read(&root->refs)); while (refcount_read(&root->refs) > 1) btrfs_put_root(root); @@ -2116,12 +2060,10 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, - IO_TREE_INODE_IO, inode); + IO_TREE_BTREE_INODE_IO, inode); BTRFS_I(inode)->io_tree.track_uptodate = false; extent_map_tree_init(&BTRFS_I(inode)->extent_tree); - BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops; - BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); @@ -2627,18 +2569,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) level = btrfs_super_root_level(sb); tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb), generation, level, NULL); - if (IS_ERR(tree_root->node) || - !extent_buffer_uptodate(tree_root->node)) { + if (IS_ERR(tree_root->node)) { handle_error = true; + ret = PTR_ERR(tree_root->node); + tree_root->node = NULL; + btrfs_warn(fs_info, "couldn't read tree root"); + continue; - if (IS_ERR(tree_root->node)) { - ret = PTR_ERR(tree_root->node); - tree_root->node = NULL; - } else if (!extent_buffer_uptodate(tree_root->node)) { - ret = -EUCLEAN; - } - - btrfs_warn(fs_info, "failed to read tree root"); + } else if (!extent_buffer_uptodate(tree_root->node)) { + handle_error = true; + ret = -EIO; + btrfs_warn(fs_info, "error while reading tree root"); continue; } @@ -2754,7 +2695,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->check_integrity_print_mask = 0; #endif btrfs_init_balance(fs_info); - btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); + btrfs_init_async_reclaim_work(fs_info); spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; @@ -2929,7 +2870,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } /* - * Verify the type first, if that or the the checksum value are + * Verify the type first, if that or the checksum value are * corrupted, we'll find out */ csum_type = btrfs_super_csum_type(disk_super); @@ -3091,8 +3032,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sb_buffer; } - sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); @@ -3418,6 +3357,8 @@ fail_block_groups: btrfs_put_block_group_cache(fs_info); fail_tree_roots: + if (fs_info->data_reloc_root) + btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root); free_root_pointers(fs_info, true); invalidate_inode_pages2(fs_info->btree_inode->i_mapping); @@ -3481,8 +3422,12 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, return ERR_CAST(page); super = page_address(page); - if (btrfs_super_bytenr(super) != bytenr || - btrfs_super_magic(super) != BTRFS_MAGIC) { + if (btrfs_super_magic(super) != BTRFS_MAGIC) { + btrfs_release_disk_super(super); + return 
ERR_PTR(-ENODATA); + } + + if (btrfs_super_bytenr(super) != bytenr) { btrfs_release_disk_super(super); return ERR_PTR(-EINVAL); } @@ -4055,6 +4000,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_cleanup_defrag_inodes(fs_info); cancel_work_sync(&fs_info->async_reclaim_work); + cancel_work_sync(&fs_info->async_data_reclaim_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4686,9 +4632,3 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) return 0; } - -static const struct extent_io_ops btree_extent_io_ops = { - /* mandatory callbacks */ - .submit_bio_hook = btree_submit_bio_hook, - .readpage_end_io_hook = btree_readpage_end_io_hook, -}; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 00dc39d47ed3..fee69ced58b4 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -76,7 +76,11 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); - +int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror); +blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif @@ -123,9 +127,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); -struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 219a09a2b734..9800a8306368 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -40,6 +40,7 @@ struct io_failure_record; enum { IO_TREE_FS_PINNED_EXTENTS, IO_TREE_FS_EXCLUDED_EXTENTS, + IO_TREE_BTREE_INODE_IO, IO_TREE_INODE_IO, IO_TREE_INODE_IO_FAILURE, IO_TREE_RELOC_BLOCKS, @@ -48,6 +49,7 @@ enum { IO_TREE_INODE_FILE_EXTENT, IO_TREE_LOG_CSUM_RANGE, IO_TREE_SELFTEST, + IO_TREE_DEVICE_ALLOC_STATE, }; struct extent_io_tree { @@ -61,7 +63,6 @@ struct extent_io_tree { u8 owner; spinlock_t lock; - const struct extent_io_ops *ops; }; struct extent_state { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e9eedc053fc5..3b21fee13e77 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -400,12 +400,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, if (type == BTRFS_SHARED_BLOCK_REF_KEY) { ASSERT(eb->fs_info); /* - * Every shared one has parent tree - * block, which must be aligned to - * nodesize. + * Every shared one has parent tree block, + * which must be aligned to sector size. */ if (offset && - IS_ALIGNED(offset, eb->fs_info->nodesize)) + IS_ALIGNED(offset, eb->fs_info->sectorsize)) return type; } } else if (is_data == BTRFS_REF_TYPE_DATA) { @@ -414,12 +413,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, if (type == BTRFS_SHARED_DATA_REF_KEY) { ASSERT(eb->fs_info); /* - * Every shared one has parent tree - * block, which must be aligned to - * nodesize. 
+ * Every shared one has parent tree block, + * which must be aligned to sector size. */ if (offset && - IS_ALIGNED(offset, eb->fs_info->nodesize)) + IS_ALIGNED(offset, eb->fs_info->sectorsize)) return type; } } else { @@ -429,8 +427,9 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, } btrfs_print_leaf((struct extent_buffer *)eb); - btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d", - eb->start, type); + btrfs_err(eb->fs_info, + "eb %llu iref 0x%lx invalid extent inline ref type %d", + eb->start, (unsigned long)iref, type); WARN_ON(1); return BTRFS_REF_TYPE_INVALID; @@ -1178,7 +1177,22 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, num_bytes, parent, root_objectid, owner, offset, 1); if (ret == 0) { - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); + /* + * We're adding refs to a tree block we already own, this + * should not happen at all. + */ + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_crit(trans->fs_info, +"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu", + bytenr, num_bytes, root_objectid); + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { + WARN_ON(1); + btrfs_crit(trans->fs_info, + "path->slots[0]=%d path->nodes[0]:", path->slots[0]); + btrfs_print_leaf(path->nodes[0]); + } + return -EUCLEAN; + } update_inline_extent_backref(path, iref, refs_to_add, extent_op, NULL); } else if (ret == -ENOENT) { @@ -1398,6 +1412,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, /* * __btrfs_inc_extent_ref - insert backreference for a given extent * + * The counterpart is in __btrfs_free_extent(), with examples and more details + * on how it works. + * * @trans: Handle of transaction * * @node: The delayed ref node used to get the bytenr/length for @@ -2850,11 +2867,10 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, len -= to_add; } spin_unlock(&global_rsv->lock); - /* Add to any tickets we may have */ - if (len) - btrfs_try_granting_tickets(fs_info, - space_info); } + /* Add to any tickets we may have */ + if (!readonly && return_free_space && len) + btrfs_try_granting_tickets(fs_info, space_info); spin_unlock(&space_info->lock); } @@ -2936,6 +2952,65 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) return 0; } +/* + * Drop one or more refs of @node. + * + * 1. Locate the extent refs. + * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item. + * Locate it, then reduce the refs number or remove the ref line completely. + * + * 2. Update the refs count in EXTENT/METADATA_ITEM + * + * Inline backref case: + * + * in extent tree we have: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 + * refs 2 gen 6 flags DATA + * extent data backref root FS_TREE objectid 258 offset 0 count 1 + * extent data backref root FS_TREE objectid 257 offset 0 count 1 + * + * This function gets called with: + * + * node->bytenr = 13631488 + * node->num_bytes = 1048576 + * root_objectid = FS_TREE + * owner_objectid = 257 + * owner_offset = 0 + * refs_to_drop = 1 + * + * Then we should get something like: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 + * refs 1 gen 6 flags DATA + * extent data backref root FS_TREE objectid 258 offset 0 count 1 + * + * Keyed backref case: + * + * in extent tree we have: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 + * refs 754 gen 6 flags DATA + * [...]
+ * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28 + * extent data backref root FS_TREE objectid 866 offset 0 count 1 + * + * This function gets called with: + * + * node->bytenr = 13631488 + * node->num_bytes = 1048576 + * root_objectid = FS_TREE + * owner_objectid = 866 + * owner_offset = 0 + * refs_to_drop = 1 + * + * Then we should get something like: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 + * refs 753 gen 6 flags DATA + * + * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed. + */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, @@ -2968,7 +3043,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; - BUG_ON(!is_data && refs_to_drop != 1); + + if (!is_data && refs_to_drop != 1) { + btrfs_crit(info, +"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", + node->bytenr, refs_to_drop); + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } if (is_data) skinny_metadata = false; @@ -2977,6 +3060,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, parent, root_objectid, owner_objectid, owner_offset); if (ret == 0) { + /* + * Either the inline backref or the SHARED_DATA_REF/ + * SHARED_BLOCK_REF is found + * + * Here is a quick path to locate EXTENT/METADATA_ITEM. + * It's possible the EXTENT/METADATA_ITEM is near current slot. + */ extent_slot = path->slots[0]; while (extent_slot >= 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, @@ -2993,13 +3083,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, found_extent = 1; break; } + + /* Quick path didn't find the EXTENT/METADATA_ITEM */ if (path->slots[0] - extent_slot > 5) break; extent_slot--; } if (!found_extent) { - BUG_ON(iref); + if (iref) { + btrfs_crit(info, +"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } + /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, path, NULL, refs_to_drop, is_data, &last_ref); @@ -3010,6 +3108,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); path->leave_spinning = 1; + /* Slow path to locate EXTENT/METADATA_ITEM */ key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; @@ -3084,19 +3183,26 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; - BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); + if (item_size < sizeof(*ei) + sizeof(*bi)) { + btrfs_crit(info, +"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %lu", + key.objectid, key.type, key.offset, + owner_objectid, item_size, + sizeof(*ei) + sizeof(*bi)); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } bi = (struct btrfs_tree_block_info *)(ei + 1); WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); } refs = btrfs_extent_refs(leaf, ei); if (refs < refs_to_drop) { - btrfs_err(info, - "trying to drop %d refs but we only have %Lu for bytenr %Lu", + btrfs_crit(info, + "trying to drop %d refs but we only have %llu for bytenr %llu", refs_to_drop, refs, bytenr); - ret = -EINVAL; - btrfs_abort_transaction(trans, ret); - goto out; +
btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; } refs -= refs_to_drop; @@ -3108,7 +3214,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * be updated by remove_extent_backref */ if (iref) { - BUG_ON(!found_extent); + if (!found_extent) { + btrfs_crit(info, +"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } } else { btrfs_set_extent_refs(leaf, ei, refs); btrfs_mark_buffer_dirty(leaf); @@ -3123,13 +3234,39 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } } else { + /* In this branch refs == 1 */ if (found_extent) { - BUG_ON(is_data && refs_to_drop != - extent_data_ref_count(path, iref)); + if (is_data && refs_to_drop != + extent_data_ref_count(path, iref)) { + btrfs_crit(info, + "invalid refs_to_drop, current refs %u refs_to_drop %u", + extent_data_ref_count(path, iref), + refs_to_drop); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } if (iref) { - BUG_ON(path->slots[0] != extent_slot); + if (path->slots[0] != extent_slot) { + btrfs_crit(info, +"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref", + key.objectid, key.type, + key.offset); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } } else { - BUG_ON(path->slots[0] != extent_slot + 1); + /* + * No inline ref, we must be at SHARED_* item, + * And it's single ref, it must be: + * | extent_slot ||extent_slot + 1| + * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] + */ + if (path->slots[0] != extent_slot + 1) { + btrfs_crit(info, + "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } path->slots[0] = extent_slot; num_to_del = 2; } @@ -3170,6 +3307,19 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); return ret; +err_dump: + /* + * Leaf dump can take up a lot of log buffer, so we only do full leaf + * dump for debug build. 
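Every conversion in this hunk follows one pattern: a BUG_ON() becomes a btrfs_crit() diagnostic, a transaction abort with -EUCLEAN, and a jump to err_dump so debug builds can additionally dump the leaf. A skeleton of that error-path shape with user-space stand-ins for the kernel helpers (illustrative only):

#include <stdio.h>

#define EUCLEAN 117	/* "structure needs cleaning": corrupt metadata */

static void abort_transaction(int err) { fprintf(stderr, "abort: %d\n", err); }
static void dump_leaf(void) { fprintf(stderr, "leaf dump...\n"); }

static int free_extent(int refs, int refs_to_drop)
{
	if (refs < refs_to_drop) {
		/* Was: BUG_ON(refs < refs_to_drop); now fail gracefully. */
		fprintf(stderr, "trying to drop %d refs but only have %d\n",
			refs_to_drop, refs);
		abort_transaction(-EUCLEAN);
		goto err_dump;
	}
	/* ... normal ref-drop work ... */
	return 0;

err_dump:
	/* Verbose, which is why the kernel gates it on CONFIG_BTRFS_DEBUG. */
	dump_leaf();
	return -EUCLEAN;
}

int main(void)
{
	return free_extent(1, 2) == -EUCLEAN ? 0 : 1;
}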
+ */ + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { + btrfs_crit(info, "path->slots[0]=%d extent_slot=%d", + path->slots[0], extent_slot); + btrfs_print_leaf(path->nodes[0]); + } + + btrfs_free_path(path); + return -EUCLEAN; } /* @@ -3919,11 +4069,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, * |- Push harder to find free extents * |- If not found, re-iterate all block groups */ -static noinline int find_free_extent(struct btrfs_fs_info *fs_info, +static noinline int find_free_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 empty_size, u64 hint_byte_orig, struct btrfs_key *ins, u64 flags, int delalloc) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int cache_block_group_error = 0; struct btrfs_block_group *block_group = NULL; @@ -3955,7 +4106,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(fs_info, num_bytes, empty_size, flags); + trace_find_free_extent(root, num_bytes, empty_size, flags); space_info = btrfs_find_space_info(fs_info, flags); if (!space_info) { @@ -4204,7 +4355,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, flags = get_alloc_profile_by_root(root, is_data); again: WARN_ON(num_bytes < fs_info->sectorsize); - ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, + ret = find_free_extent(root, ram_bytes, num_bytes, empty_size, hint_byte, ins, flags, delalloc); if (!ret && !is_data) { btrfs_dec_block_group_reservations(fs_info, ins->objectid); @@ -4505,7 +4656,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, int level, u64 owner) + u64 bytenr, int level, u64 owner, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *buf; @@ -4528,7 +4680,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, } btrfs_set_buffer_lockdep_class(owner, buf, level); - btrfs_tree_lock(buf); + __btrfs_tree_lock(buf, nest); btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); @@ -4574,7 +4726,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, const struct btrfs_disk_key *key, int level, u64 hint, - u64 empty_size) + u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key ins; @@ -4590,7 +4743,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (btrfs_is_testing(fs_info)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, - level, root_objectid); + level, root_objectid, nest); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; @@ -4607,7 +4760,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, goto out_unuse; buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, - root_objectid); + root_objectid, nest); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out_free_reserved; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a940edb1e64f..60f5f68d892d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -160,19 +160,20 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits, return ret; } -static int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags) +int 
__must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags) { blk_status_t ret = 0; struct extent_io_tree *tree = bio->bi_private; bio->bi_private = NULL; - if (tree->ops) - ret = tree->ops->submit_bio_hook(tree->private_data, bio, - mirror_num, bio_flags); + if (is_data_inode(tree->private_data)) + ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, + bio_flags); else - btrfsic_submit_bio(bio); + ret = btrfs_submit_metadata_bio(tree->private_data, bio, + mirror_num, bio_flags); return blk_status_to_errno(ret); } @@ -280,7 +281,6 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, { tree->fs_info = fs_info; tree->state = RB_ROOT; - tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); tree->private_data = private_data; @@ -2819,8 +2819,6 @@ static void end_bio_extent_readpage(struct bio *bio) struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - bool data_inode = btrfs_ino(BTRFS_I(inode)) - != BTRFS_BTREE_INODE_OBJECTID; btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", @@ -2851,9 +2849,12 @@ static void end_bio_extent_readpage(struct bio *bio) mirror = io_bio->mirror_num; if (likely(uptodate)) { - ret = tree->ops->readpage_end_io_hook(io_bio, offset, - page, start, end, - mirror); + if (is_data_inode(inode)) + ret = btrfs_verify_data_csum(io_bio, offset, page, + start, end, mirror); + else + ret = btrfs_validate_metadata_buffer(io_bio, + offset, page, start, end, mirror); if (ret) uptodate = 0; else @@ -2866,7 +2867,7 @@ static void end_bio_extent_readpage(struct bio *bio) if (likely(uptodate)) goto readpage_ok; - if (data_inode) { + if (is_data_inode(inode)) { /* * The generic bio_readpage_error handles errors the @@ -2881,7 +2882,7 @@ static void end_bio_extent_readpage(struct bio *bio) if (!btrfs_submit_read_repair(inode, bio, offset, page, start - page_offset(page), start, end, mirror, - tree->ops->submit_bio_hook)) { + btrfs_submit_data_bio)) { uptodate = !bio->bi_status; offset += len; continue; @@ -3053,7 +3054,6 @@ static int submit_extent_page(unsigned int opf, else contig = bio_end_sector(bio) == sector; - ASSERT(tree->ops); if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags)) can_merge = false; @@ -3110,8 +3110,7 @@ void set_page_extent_mapped(struct page *page) static struct extent_map * __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, - u64 start, u64 len, get_extent_t *get_extent, - struct extent_map **em_cached) + u64 start, u64 len, struct extent_map **em_cached) { struct extent_map *em; @@ -3127,7 +3126,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = get_extent(BTRFS_I(inode), page, pg_offset, start, len); + em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); if (em_cached && !IS_ERR_OR_NULL(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); @@ -3142,12 +3141,9 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -static int __do_readpage(struct page *page, - get_extent_t *get_extent, - struct extent_map **em_cached, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, unsigned int read_flags, - u64 *prev_em_start) +int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + struct bio **bio, unsigned long *bio_flags, + 
unsigned int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; u64 start = page_offset(page); @@ -3209,7 +3205,7 @@ static int __do_readpage(struct page *page, break; } em = __get_extent_map(inode, page, pg_offset, cur, - end - cur + 1, get_extent, em_cached); + end - cur + 1, em_cached); if (IS_ERR_OR_NULL(em)) { SetPageError(page); unlock_extent(tree, cur, end); @@ -3241,7 +3237,7 @@ static int __do_readpage(struct page *page, /* * If we have a file range that points to a compressed extent - * and it's followed by a consecutive file range that points to + * and it's followed by a consecutive file range that points * to the same compressed extent (possibly with a different * offset and/or length, so it either points to the whole extent * or only part of it), we must make sure we do not submit a @@ -3325,7 +3321,7 @@ static int __do_readpage(struct page *page, ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, page, offset, disk_io_size, pg_offset, bio, - end_bio_extent_readpage, mirror_num, + end_bio_extent_readpage, 0, *bio_flags, this_bio_flag, force_bio_submit); @@ -3362,44 +3358,12 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { - __do_readpage(pages[index], btrfs_get_extent, em_cached, - bio, 0, bio_flags, REQ_RAHEAD, prev_em_start); + btrfs_do_readpage(pages[index], em_cached, bio, bio_flags, + REQ_RAHEAD, prev_em_start); put_page(pages[index]); } } -static int __extent_read_full_page(struct page *page, - get_extent_t *get_extent, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, - unsigned int read_flags) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - int ret; - - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - - ret = __do_readpage(page, get_extent, NULL, bio, mirror_num, - bio_flags, read_flags, NULL); - return ret; -} - -int extent_read_full_page(struct page *page, get_extent_t *get_extent, - int mirror_num) -{ - struct bio *bio = NULL; - unsigned long bio_flags = 0; - int ret; - - ret = __extent_read_full_page(page, get_extent, &bio, mirror_num, - &bio_flags, 0); - if (bio) - ret = submit_one_bio(bio, mirror_num, bio_flags); - return ret; -} - static void update_nr_written(struct writeback_control *wbc, unsigned long nr_written) { @@ -4552,7 +4516,7 @@ next: * helper function for fiemap, which doesn't want to see any holes. 
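get_extent_skip_holes(), whose comment this is, repeatedly asks for the next mapping and hops over holes until a real extent turns up or the 'last' offset is passed. A stand-alone model of that advance-past-holes loop (a hole is just a flagged mapping here; names are illustrative):

#include <stdio.h>

struct mapping { unsigned long long start, len; int is_hole; };

static const struct mapping *lookup(const struct mapping *maps, int nr,
				    unsigned long long offset)
{
	for (int i = 0; i < nr; i++)
		if (offset >= maps[i].start &&
		    offset < maps[i].start + maps[i].len)
			return &maps[i];
	return NULL;
}

/* Skip holes starting at offset; stop once the cursor passes 'last'. */
static const struct mapping *skip_holes(const struct mapping *maps, int nr,
					unsigned long long offset,
					unsigned long long last)
{
	while (offset < last) {
		const struct mapping *m = lookup(maps, nr, offset);

		if (!m)
			break;
		if (!m->is_hole)
			return m;		/* first real extent wins */
		offset = m->start + m->len;	/* hop over the hole */
	}
	return NULL;
}

int main(void)
{
	const struct mapping maps[] = {
		{ 0, 4096, 1 }, { 4096, 4096, 1 }, { 8192, 1 << 20, 0 },
	};
	const struct mapping *m = skip_holes(maps, 3, 0, 16384);

	if (m)
		printf("extent at %llu len %llu\n", m->start, m->len);
	return 0;
}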
* This maps until we find something past 'last' */ -static struct extent_map *get_extent_skip_holes(struct inode *inode, +static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode, u64 offset, u64 last) { u64 sectorsize = btrfs_inode_sectorsize(inode); @@ -4567,7 +4531,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, if (len == 0) break; len = ALIGN(len, sectorsize); - em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len); + em = btrfs_get_extent_fiemap(inode, offset, len); if (IS_ERR_OR_NULL(em)) return em; @@ -4696,7 +4660,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, return ret; } -int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int ret = 0; @@ -4707,12 +4671,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 last; u64 last_for_get_extent = 0; u64 disko = 0; - u64 isize = i_size_read(inode); + u64 isize = i_size_read(&inode->vfs_inode); struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct fiemap_cache cache = { 0 }; struct ulist *roots; struct ulist *tmp_ulist; @@ -4743,8 +4707,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * lookup the last file extent. We're not using i_size here * because there might be preallocation past i_size */ - ret = btrfs_lookup_file_extent(NULL, root, path, - btrfs_ino(BTRFS_I(inode)), -1, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, + 0); if (ret < 0) { goto out_free_ulist; } else { @@ -4758,7 +4722,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, found_type = found_key.type; /* No extents, but there might be delalloc bits */ - if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) || + if (found_key.objectid != btrfs_ino(inode) || found_type != BTRFS_EXTENT_DATA_KEY) { /* have to trust i_size as the end */ last = (u64)-1; @@ -4784,7 +4748,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last_for_get_extent = isize; } - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, + lock_extent_bits(&inode->io_tree, start, start + len - 1, &cached_state); em = get_extent_skip_holes(inode, start, last_for_get_extent); @@ -4853,8 +4817,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * then we're just getting a count and we can skip the * lookup stuff. 
*/ - ret = btrfs_check_shared(root, - btrfs_ino(BTRFS_I(inode)), + ret = btrfs_check_shared(root, btrfs_ino(inode), bytenr, roots, tmp_ulist); if (ret < 0) goto out_free; @@ -4898,7 +4861,7 @@ out_free: ret = emit_last_fiemap_cache(fieinfo, &cache); free_extent_map(em); out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, + unlock_extent_cached(&inode->io_tree, start, start + len - 1, &cached_state); out_free_ulist: @@ -4990,7 +4953,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, rwlock_init(&eb->lock); atomic_set(&eb->blocking_readers, 0); eb->blocking_writers = 0; - eb->lock_nested = false; + eb->lock_recursed = false; init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); @@ -5574,20 +5537,19 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } ClearPageError(page); - err = __extent_read_full_page(page, - btree_get_extent, &bio, - mirror_num, &bio_flags, - REQ_META); + err = submit_extent_page(REQ_OP_READ | REQ_META, NULL, + page, page_offset(page), PAGE_SIZE, 0, + &bio, end_bio_extent_readpage, + mirror_num, 0, 0, false); if (err) { - ret = err; /* - * We use &bio in above __extent_read_full_page, - * so we ensure that if it returns error, the - * current page fails to add itself to bio and - * it's been unlocked. - * - * We must dec io_pages by ourselves. + * We failed to submit the bio so it's the + * caller's responsibility to perform cleanup + * i.e unlock page/set error bit. */ + ret = err; + SetPageError(page); + unlock_page(page); atomic_dec(&eb->io_pages); } } else { @@ -5622,6 +5584,36 @@ unlock_exit: return ret; } +static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, + unsigned long len) +{ + btrfs_warn(eb->fs_info, + "access to eb bytenr %llu len %lu out of range start %lu len %lu", + eb->start, eb->len, start, len); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + + return true; +} + +/* + * Check if the [start, start + len) range is valid before reading/writing + * the eb. + * NOTE: @start and @len are offset inside the eb, not logical address. + * + * Caller should not touch the dst/src memory if this function returns error. 
+ */ +static inline int check_eb_range(const struct extent_buffer *eb, + unsigned long start, unsigned long len) +{ + unsigned long offset; + + /* start, start + len should not go beyond eb->len nor overflow */ + if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len)) + return report_eb_range(eb, start, len); + + return false; +} + void read_extent_buffer(const struct extent_buffer *eb, void *dstv, unsigned long start, unsigned long len) { @@ -5632,12 +5624,8 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, char *dst = (char *)dstv; unsigned long i = start >> PAGE_SHIFT; - if (start + len > eb->len) { - WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", - eb->start, eb->len, start, len); - memset(dst, 0, len); + if (check_eb_range(eb, start, len)) return; - } offset = offset_in_page(start); @@ -5702,8 +5690,8 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long i = start >> PAGE_SHIFT; int ret = 0; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return -EINVAL; offset = offset_in_page(start); @@ -5756,8 +5744,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, char *src = (char *)srcv; unsigned long i = start >> PAGE_SHIFT; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return; offset = offset_in_page(start); @@ -5785,8 +5773,8 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, char *kaddr; unsigned long i = start >> PAGE_SHIFT; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return; offset = offset_in_page(start); @@ -5830,6 +5818,10 @@ void copy_extent_buffer(const struct extent_buffer *dst, char *kaddr; unsigned long i = dst_offset >> PAGE_SHIFT; + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(src, src_offset, len)) + return; + WARN_ON(src->len != dst_len); offset = offset_in_page(dst_offset); @@ -6019,25 +6011,15 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - struct btrfs_fs_info *fs_info = dst->fs_info; size_t cur; size_t dst_off_in_page; size_t src_off_in_page; unsigned long dst_i; unsigned long src_i; - if (src_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus src_offset %lu move len %lu dst len %lu", - src_offset, len, dst->len); - BUG(); - } - if (dst_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus dst_offset %lu move len %lu dst len %lu", - dst_offset, len, dst->len); - BUG(); - } + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(dst, src_offset, len)) + return; while (len > 0) { dst_off_in_page = offset_in_page(dst_offset); @@ -6064,7 +6046,6 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - struct btrfs_fs_info *fs_info = dst->fs_info; size_t cur; size_t dst_off_in_page; size_t src_off_in_page; @@ -6073,18 +6054,9 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long dst_i; unsigned long src_i; - if (src_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus src_offset %lu move len %lu len %lu", - src_offset, len, dst->len); - BUG(); - } - if (dst_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus dst_offset %lu move len %lu len %lu", - 
dst_offset, len, dst->len); - BUG(); - } + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(dst, src_offset, len)) + return; if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 30794ae58498..f39d02e7f7ef 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -74,18 +74,6 @@ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio, typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, struct bio *bio, u64 bio_offset); -struct extent_io_ops { - /* - * The following callbacks must be always defined, the function - * pointer will be called unconditionally. - */ - submit_bio_hook_t *submit_bio_hook; - int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, - struct page *page, u64 start, u64 end, - int mirror); -}; - - #define INLINE_EXTENT_BUFFER_PAGES 16 #define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE) struct extent_buffer { @@ -102,7 +90,7 @@ struct extent_buffer { int blocking_writers; atomic_t blocking_readers; - bool lock_nested; + bool lock_recursed; /* >= 0 if eb belongs to a log tree, -1 otherwise */ short log_index; @@ -193,8 +181,11 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int extent_read_full_page(struct page *page, get_extent_t *get_extent, - int mirror_num); +int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags); +int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + struct bio **bio, unsigned long *bio_flags, + unsigned int read_flags, u64 *prev_em_start); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); @@ -203,7 +194,7 @@ int extent_writepages(struct address_space *mapping, int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); void extent_readahead(struct readahead_control *rac); -int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); void set_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 7d5ec71615b8..8f4f2bd6d9b9 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -318,8 +318,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (page_offsets) offset = page_offset(bvec.bv_page) + bvec.bv_offset; - count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, - csum, nblocks); + count = btrfs_find_ordered_sum(BTRFS_I(inode), offset, + disk_bytenr, csum, nblocks); if (count) goto found; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4507c3d09399..0ff659455b1e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1057,11 +1057,7 @@ delete_extent_item: if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - setup_items_for_insert(root, path, &key, - &extent_item_size, - extent_item_size, - sizeof(struct btrfs_item) + - extent_item_size, 1); + setup_items_for_insert(root, path, &key, &extent_item_size, 1); *key_inserted = 1; } @@ -1477,9 +1473,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, int ret = 0; start_pos = round_down(pos, fs_info->sectorsize); - last_pos = 
start_pos - + round_up(pos + write_bytes - start_pos, - fs_info->sectorsize) - 1; + last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1; if (start_pos < inode->vfs_inode.i_size) { struct btrfs_ordered_extent *ordered; @@ -1497,8 +1491,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, unlock_page(pages[i]); put_page(pages[i]); } - btrfs_start_ordered_extent(&inode->vfs_inode, - ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); return -EAGAIN; } @@ -1872,7 +1865,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) loff_t endbyte; int err; - written = generic_file_direct_write(iocb, from); + written = btrfs_direct_IO(iocb, from); if (written < 0 || !iov_iter_count(from)) return written; @@ -2025,7 +2018,40 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, atomic_inc(&BTRFS_I(inode)->sync_writers); if (iocb->ki_flags & IOCB_DIRECT) { + /* + * 1. We must always clear IOCB_DSYNC in order to not deadlock + * in iomap, as it calls generic_write_sync() in this case. + * 2. If we are async, we can call iomap_dio_complete() either + * in + * + * 2.1. A worker thread from the last bio completed. In this + * case we need to mark the btrfs_dio_data that it is + * async in order to call generic_write_sync() properly. + * This is handled by setting BTRFS_DIO_SYNC_STUB in the + * current->journal_info. + * 2.2 The submitter context, because all IO completed + * before we exited iomap_dio_rw(). In this case we can + * just re-set the IOCB_DSYNC on the iocb and we'll do + * the sync below. If our ->end_io() gets called and + * current->journal_info is set, then we know we're in + * our current context and we will clear + * current->journal_info to indicate that we need to + * sync below. + */ + if (sync) { + ASSERT(current->journal_info == NULL); + iocb->ki_flags &= ~IOCB_DSYNC; + current->journal_info = BTRFS_DIO_SYNC_STUB; + } num_written = __btrfs_direct_write(iocb, from); + + /* + * As stated above, we cleared journal_info, so we need to do + * the sync ourselves. + */ + if (sync && current->journal_info == NULL) + iocb->ki_flags |= IOCB_DSYNC; + current->journal_info = NULL; } else { num_written = btrfs_buffered_write(iocb, from); if (num_written > 0) @@ -2065,12 +2091,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp) filp->private_data = NULL; /* - * ordered_data_close is set by setattr when we are about to truncate - * a file from a non-zero size to a zero size. This tries to - * flush down new bytes that may have been written if the - * application were using truncate to replace a file in place. + * Set by setattr when we are about to truncate a file from a non-zero + * size to a zero size. This tries to flush down new bytes that may + * have been written if the application were using truncate to replace + * a file in place. */ - if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); return 0; @@ -2116,20 +2142,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; + u64 len; + bool full_sync; trace_btrfs_sync_file(file, datasync); btrfs_init_log_ctx(&ctx, inode); /* - * Set the range to full if the NO_HOLES feature is not enabled. - * This is to avoid missing file extent items representing holes after - * replaying the log. 
+ * Always set the range to a full range, otherwise we can get into
+ * several problems, from missing file extent items to represent holes
+ * when not using the NO_HOLES feature, to log tree corruption due to
+ * races between hole detection during logging and completion of ordered
+ * extents outside the range, to missing checksums due to ordered extents
+ * for which we flushed only a subset of their pages.
 */
- if (!btrfs_fs_incompat(fs_info, NO_HOLES)) {
- start = 0;
- end = LLONG_MAX;
- }
+ start = 0;
+ end = LLONG_MAX;
+ len = (u64)LLONG_MAX + 1;

 /*
 * We write the dirty pages in the range and wait until they complete
@@ -2153,19 +2183,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 atomic_inc(&root->log_batch);

 /*
- * If the inode needs a full sync, make sure we use a full range to
- * avoid log tree corruption, due to hole detection racing with ordered
- * extent completion for adjacent ranges and races between logging and
- * completion of ordered extents for adjancent ranges - both races
- * could lead to file extent items in the log with overlapping ranges.
- * Do this while holding the inode lock, to avoid races with other
- * tasks.
+ * Always check for the full sync flag while holding the inode's lock,
+ * to avoid races with other tasks. The flag must be either set all the
+ * time during logging or always off all the time while logging.
 */
- if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags)) {
- start = 0;
- end = LLONG_MAX;
- }
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);

 /*
 * Before we acquired the inode's lock, someone may have dirtied more
@@ -2196,20 +2219,42 @@
 * We have to do this here to avoid the priority inversion of waiting on
 * IO of a lower priority task while holding a transaction open.
 *
- * Also, the range length can be represented by u64, we have to do the
- * typecasts to avoid signed overflow if it's [0, LLONG_MAX].
+ * For a full fsync we wait for the ordered extents to complete, while
+ * for a fast fsync we wait just for writeback to complete, and then
+ * attach the ordered extents to the transaction so that a transaction
+ * commit waits for their completion, to avoid data loss if we fsync,
+ * the current transaction commits before the ordered extents complete,
+ * and a power failure happens right after that.
 */
- ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1);
- if (ret) {
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ if (full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ } else {
+ /*
+ * Get our ordered extents as soon as possible to avoid doing
+ * checksum lookups in the csum tree, and use instead the
+ * checksums attached to the ordered extents.
+ */
+ btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
+ &ctx.ordered_extents);
+ ret = filemap_fdatawait_range(inode->i_mapping, start, end);
 }
+
+ if (ret)
+ goto out_release_extents;
+
 atomic_inc(&root->log_batch);
+
+ /*
+ * If we are doing a fast fsync we cannot bail out if the inode's
+ * last_trans is <= the last committed transaction, because we only
+ * update the last_trans of the inode during ordered extent completion,
+ * and for a fast fsync we don't wait for that, we only wait for the
+ * writeback to complete.
+ */ smp_mb(); if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) || - BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed) { + (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed && + (full_sync || list_empty(&ctx.ordered_extents)))) { /* * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever @@ -2225,9 +2270,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * checked called fsync. */ ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); - up_write(&BTRFS_I(inode)->dio_sem); - inode_unlock(inode); - goto out; + goto out_release_extents; } /* @@ -2244,12 +2287,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - up_write(&BTRFS_I(inode)->dio_sem); - inode_unlock(inode); - goto out; + goto out_release_extents; } - ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx); + ret = btrfs_log_dentry_safe(trans, dentry, &ctx); + btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = 1; @@ -2276,6 +2318,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } } + if (!full_sync) { + ret = btrfs_wait_ordered_range(inode, start, len); + if (ret) { + btrfs_end_transaction(trans); + goto out; + } + } ret = btrfs_commit_transaction(trans); } else { ret = btrfs_end_transaction(trans); @@ -2286,6 +2335,12 @@ out: if (!ret) ret = err; return ret > 0 ? -EIO : ret; + +out_release_extents: + btrfs_release_log_ctx_extents(&ctx); + up_write(&BTRFS_I(inode)->dio_sem); + inode_unlock(inode); + goto out; } static const struct vm_operations_struct btrfs_file_vm_ops = { @@ -2481,7 +2536,8 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, lockend); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), + lockend); /* * We need to make sure we have no ordered extents in this range @@ -2509,11 +2565,11 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, return 0; } -static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, +static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *path, - struct btrfs_clone_extent_info *clone_info, - const u64 clone_len) + struct btrfs_replace_extent_info *extent_info, + const u64 replace_len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2522,51 +2578,69 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; int slot; struct btrfs_ref ref = { 0 }; - u64 ref_offset; int ret; - if (clone_len == 0) + if (replace_len == 0) return 0; - if (clone_info->disk_offset == 0 && + if (extent_info->disk_offset == 0 && btrfs_fs_incompat(fs_info, NO_HOLES)) return 0; key.objectid = btrfs_ino(BTRFS_I(inode)); key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = clone_info->file_offset; + key.offset = extent_info->file_offset; ret = btrfs_insert_empty_item(trans, root, path, &key, - clone_info->item_size); + sizeof(struct btrfs_file_extent_item)); if (ret) return ret; leaf = path->nodes[0]; slot = path->slots[0]; - write_extent_buffer(leaf, clone_info->extent_buf, + write_extent_buffer(leaf, extent_info->extent_buf, 
btrfs_item_ptr_offset(leaf, slot), - clone_info->item_size); + sizeof(struct btrfs_file_extent_item)); extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset); - btrfs_set_file_extent_num_bytes(leaf, extent, clone_len); + ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset); + btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); + if (extent_info->is_new_extent) + btrfs_set_file_extent_generation(leaf, extent, trans->transid); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), - clone_info->file_offset, clone_len); + extent_info->file_offset, replace_len); if (ret) return ret; /* If it's a hole, nothing more needs to be done. */ - if (clone_info->disk_offset == 0) + if (extent_info->disk_offset == 0) return 0; - inode_add_bytes(inode, clone_len); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - clone_info->disk_offset, - clone_info->disk_len, 0); - ref_offset = clone_info->file_offset - clone_info->data_offset; - btrfs_init_data_ref(&ref, root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), ref_offset); - ret = btrfs_inc_extent_ref(trans, &ref); + inode_add_bytes(inode, replace_len); + + if (extent_info->is_new_extent && extent_info->insertions == 0) { + key.objectid = extent_info->disk_offset; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = extent_info->disk_len; + ret = btrfs_alloc_reserved_file_extent(trans, root, + btrfs_ino(BTRFS_I(inode)), + extent_info->file_offset, + extent_info->qgroup_reserved, + &key); + } else { + u64 ref_offset; + + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, + extent_info->disk_offset, + extent_info->disk_len, 0); + ref_offset = extent_info->file_offset - extent_info->data_offset; + btrfs_init_data_ref(&ref, root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), ref_offset); + ret = btrfs_inc_extent_ref(trans, &ref); + } + + extent_info->insertions++; return ret; } @@ -2574,15 +2648,15 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, /* * The respective range must have been previously locked, as well as the inode. * The end offset is inclusive (last byte of the range). - * @clone_info is NULL for fallocate's hole punching and non-NULL for extent - * cloning. - * When cloning, we don't want to end up in a state where we dropped extents - * without inserting a new one, so we must abort the transaction to avoid a - * corruption. + * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing + * the file range with an extent. + * When not punching a hole, we don't want to end up in a state where we dropped + * extents without inserting a new one, so we must abort the transaction to avoid + * a corruption. 
*/ -int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, +int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, - struct btrfs_clone_extent_info *clone_info, + struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2611,10 +2685,10 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, /* * 1 - update the inode * 1 - removing the extents in the range - * 1 - adding the hole extent if no_holes isn't set or if we are cloning - * an extent + * 1 - adding the hole extent if no_holes isn't set or if we are + * replacing the range with a new extent */ - if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info) + if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info) rsv_count = 3; else rsv_count = 2; @@ -2644,14 +2718,15 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * returned by __btrfs_drop_extents() without having * changed anything in the file. */ - if (clone_info && ret && ret != -EOPNOTSUPP) + if (extent_info && !extent_info->is_new_extent && + ret && ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, ret); break; } trans->block_rsv = &fs_info->trans_block_rsv; - if (!clone_info && cur_offset < drop_end && + if (!extent_info && cur_offset < drop_end && cur_offset < ino_size) { ret = fill_holes(trans, BTRFS_I(inode), path, cur_offset, drop_end); @@ -2665,7 +2740,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); break; } - } else if (!clone_info && cur_offset < drop_end) { + } else if (!extent_info && cur_offset < drop_end) { /* * We are past the i_size here, but since we didn't * insert holes we need to clear the mapped area so we @@ -2685,18 +2760,18 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, } } - if (clone_info && drop_end > clone_info->file_offset) { - u64 clone_len = drop_end - clone_info->file_offset; + if (extent_info && drop_end > extent_info->file_offset) { + u64 replace_len = drop_end - extent_info->file_offset; - ret = btrfs_insert_clone_extent(trans, inode, path, - clone_info, clone_len); + ret = btrfs_insert_replace_extent(trans, inode, path, + extent_info, replace_len); if (ret) { btrfs_abort_transaction(trans, ret); break; } - clone_info->data_len -= clone_len; - clone_info->data_offset += clone_len; - clone_info->file_offset += clone_len; + extent_info->data_len -= replace_len; + extent_info->data_offset += replace_len; + extent_info->file_offset += replace_len; } cur_offset = drop_end; @@ -2720,7 +2795,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; - if (!clone_info) { + if (!extent_info) { ret = find_first_non_hole(inode, &cur_offset, &len); if (unlikely(ret < 0)) break; @@ -2739,7 +2814,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * than 16Mb would force the full fsync any way (when * try_release_extent_mapping() is invoked during page cache truncation. */ - if (clone_info) + if (extent_info && !extent_info->is_new_extent) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); @@ -2765,7 +2840,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * (because it's useless) or if it represents a 0 bytes range (when * cur_offset == drop_end). 
*/ - if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) { + if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) { ret = fill_holes(trans, BTRFS_I(inode), path, cur_offset, drop_end); if (ret) { @@ -2773,7 +2848,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); goto out_trans; } - } else if (!clone_info && cur_offset < drop_end) { + } else if (!extent_info && cur_offset < drop_end) { /* See the comment in the loop above for the reasoning here. */ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), cur_offset, drop_end - cur_offset); @@ -2783,9 +2858,9 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, } } - if (clone_info) { - ret = btrfs_insert_clone_extent(trans, inode, path, clone_info, - clone_info->data_len); + if (extent_info) { + ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, + extent_info->data_len); if (ret) { btrfs_abort_transaction(trans, ret); goto out_trans; @@ -2840,9 +2915,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } - lockstart = round_up(offset, btrfs_inode_sectorsize(inode)); + lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode))); lockend = round_down(offset + len, - btrfs_inode_sectorsize(inode)) - 1; + btrfs_inode_sectorsize(BTRFS_I(inode))) - 1; same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); /* @@ -2927,7 +3002,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out; } - ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL, + ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL, &trans); btrfs_free_path(path); if (ret) @@ -3044,7 +3119,7 @@ enum { RANGE_BOUNDARY_HOLE, }; -static int btrfs_zero_range_check_range_boundary(struct inode *inode, +static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, u64 offset) { const u64 sectorsize = btrfs_inode_sectorsize(inode); @@ -3052,7 +3127,7 @@ static int btrfs_zero_range_check_range_boundary(struct inode *inode, int ret; offset = round_down(offset, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); @@ -3077,7 +3152,7 @@ static int btrfs_zero_range(struct inode *inode, struct extent_changeset *data_reserved = NULL; int ret; u64 alloc_hint = 0; - const u64 sectorsize = btrfs_inode_sectorsize(inode); + const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); u64 alloc_start = round_down(offset, sectorsize); u64 alloc_end = round_up(offset + len, sectorsize); u64 bytes_to_reserve = 0; @@ -3167,7 +3242,8 @@ static int btrfs_zero_range(struct inode *inode, * to cover them. 
*/ if (!IS_ALIGNED(offset, sectorsize)) { - ret = btrfs_zero_range_check_range_boundary(inode, offset); + ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), + offset); if (ret < 0) goto out; if (ret == RANGE_BOUNDARY_HOLE) { @@ -3183,7 +3259,7 @@ static int btrfs_zero_range(struct inode *inode, } if (!IS_ALIGNED(offset + len, sectorsize)) { - ret = btrfs_zero_range_check_range_boundary(inode, + ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), offset + len); if (ret < 0) goto out; @@ -3258,7 +3334,7 @@ static long btrfs_fallocate(struct file *file, int mode, u64 locked_end; u64 actual_end = 0; struct extent_map *em; - int blocksize = btrfs_inode_sectorsize(inode); + int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode)); int ret; alloc_start = round_down(offset, blocksize); @@ -3340,7 +3416,8 @@ static long btrfs_fallocate(struct file *file, int mode, */ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, locked_end); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), + locked_end); if (ordered && ordered->file_offset + ordered->num_bytes > alloc_start && @@ -3541,9 +3618,26 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) return generic_file_open(inode, filp); } +static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret = 0; + + if (iocb->ki_flags & IOCB_DIRECT) { + struct inode *inode = file_inode(iocb->ki_filp); + + inode_lock_shared(inode); + ret = btrfs_direct_IO(iocb, to); + inode_unlock_shared(inode); + if (ret < 0) + return ret; + } + + return generic_file_buffered_read(iocb, to, ret); +} + const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, - .read_iter = generic_file_read_iter, + .read_iter = btrfs_file_read_iter, .splice_read = generic_file_splice_read, .write_iter = btrfs_file_write_iter, .splice_write = iter_file_splice_write, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index dc82fd0c80cb..af0013d3df63 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -413,8 +413,6 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { - __le64 *val; - io_ctl_map_page(io_ctl, 1); /* @@ -429,14 +427,13 @@ static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) io_ctl->size -= sizeof(u64) * 2; } - val = io_ctl->cur; - *val = cpu_to_le64(generation); + put_unaligned_le64(generation, io_ctl->cur); io_ctl->cur += sizeof(u64); } static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { - __le64 *gen; + u64 cache_gen; /* * Skip the crc area. 
If we don't check crcs then we just have a 64bit @@ -451,11 +448,11 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) io_ctl->size -= sizeof(u64) * 2; } - gen = io_ctl->cur; - if (le64_to_cpu(*gen) != generation) { + cache_gen = get_unaligned_le64(io_ctl->cur); + if (cache_gen != generation) { btrfs_err_rl(io_ctl->fs_info, "space cache generation (%llu) does not match inode (%llu)", - *gen, generation); + cache_gen, generation); io_ctl_unmap_page(io_ctl); return -EIO; } @@ -525,8 +522,8 @@ static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, return -ENOSPC; entry = io_ctl->cur; - entry->offset = cpu_to_le64(offset); - entry->bytes = cpu_to_le64(bytes); + put_unaligned_le64(offset, &entry->offset); + put_unaligned_le64(bytes, &entry->bytes); entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : BTRFS_FREE_SPACE_EXTENT; io_ctl->cur += sizeof(struct btrfs_free_space_entry); @@ -599,8 +596,8 @@ static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, } e = io_ctl->cur; - entry->offset = le64_to_cpu(e->offset); - entry->bytes = le64_to_cpu(e->bytes); + entry->offset = get_unaligned_le64(&e->offset); + entry->bytes = get_unaligned_le64(&e->bytes); *type = e->type; io_ctl->cur += sizeof(struct btrfs_free_space_entry); io_ctl->size -= sizeof(struct btrfs_free_space_entry); @@ -1353,7 +1350,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* * at this point the pages are under IO and we're happy, - * The caller is responsible for waiting on them and updating the + * The caller is responsible for waiting on them and updating * the cache and the inode */ io_ctl->entries = entries; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9570458aa847..936c3137c646 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6,7 +6,6 @@ #include <crypto/hash.h> #include <linux/kernel.h> #include <linux/bio.h> -#include <linux/buffer_head.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -31,6 +30,7 @@ #include <linux/swap.h> #include <linux/migrate.h> #include <linux/sched/mm.h> +#include <linux/iomap.h> #include <asm/unaligned.h> #include "misc.h" #include "ctree.h" @@ -59,9 +59,10 @@ struct btrfs_iget_args { struct btrfs_dio_data { u64 reserve; - u64 unsubmitted_oe_range_start; - u64 unsubmitted_oe_range_end; - int overwrite; + loff_t length; + ssize_t submitted; + struct extent_changeset *data_reserved; + bool sync; }; static const struct inode_operations btrfs_dir_inode_operations; @@ -70,7 +71,6 @@ static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; -static const struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; @@ -140,13 +140,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, static int btrfs_dirty_inode(struct inode *inode); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_inode_set_ops(struct inode *inode) -{ - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; -} -#endif - static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, const struct qstr *qstr) @@ -2183,9 +2176,8 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, * * c-3) otherwise: async submit */ -static 
blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, - unsigned long bio_flags) +blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2245,16 +2237,15 @@ out: * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. */ -static noinline int add_pending_csums(struct btrfs_trans_handle *trans, - struct inode *inode, struct list_head *list) +static int add_pending_csums(struct btrfs_trans_handle *trans, + struct list_head *list) { struct btrfs_ordered_sum *sum; int ret; list_for_each_entry(sum, list, list) { trans->adding_csums = true; - ret = btrfs_csum_file_blocks(trans, - BTRFS_I(inode)->root->fs_info->csum_root, sum); + ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum); trans->adding_csums = false; if (ret) return ret; @@ -2357,7 +2348,7 @@ again: unlock_extent_cached(&inode->io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -2548,7 +2539,6 @@ static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, } static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, - struct inode *inode, struct btrfs_ordered_extent *oe) { struct btrfs_file_extent_item stack_fi; @@ -2568,8 +2558,9 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ - return insert_reserved_file_extent(trans, BTRFS_I(inode), oe->file_offset, - &stack_fi, oe->qgroup_rsv); + return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), + oe->file_offset, &stack_fi, + oe->qgroup_rsv); } /* @@ -2666,8 +2657,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) logical_len); } else { BUG_ON(root == fs_info->tree_root); - ret = insert_ordered_extent_file_extent(trans, inode, - ordered_extent); + ret = insert_ordered_extent_file_extent(trans, ordered_extent); if (!ret) { clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, @@ -2683,7 +2673,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - ret = add_pending_csums(trans, inode, &ordered_extent->list); + ret = add_pending_csums(trans, &ordered_extent->list); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -2752,7 +2742,7 @@ out: * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. 
*/ - btrfs_remove_ordered_extent(inode, ordered_extent); + btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent); /* once for us */ btrfs_put_ordered_extent(ordered_extent); @@ -2772,8 +2762,8 @@ static void finish_ordered_fn(struct btrfs_work *work) void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, u64 end, int uptodate) { - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_extent *ordered_extent = NULL; struct btrfs_workqueue *wq; @@ -2784,7 +2774,7 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, end - start + 1, uptodate)) return; - if (btrfs_is_free_space_inode(BTRFS_I(inode))) + if (btrfs_is_free_space_inode(inode)) wq = fs_info->endio_freespace_worker; else wq = fs_info->endio_write_workers; @@ -2833,9 +2823,8 @@ zeroit: * if there's a match, we allow the bio to finish. If not, the code in * extent_io.c will try to find good copies for us. */ -static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, - u64 phy_offset, struct page *page, - u64 start, u64 end, int mirror) +int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, int mirror) { size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; @@ -3055,7 +3044,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) if (ret == -ENOENT && root == fs_info->tree_root) { struct btrfs_root *dead_root; - struct btrfs_fs_info *fs_info = root->fs_info; int is_dead_root = 0; /* @@ -3395,7 +3383,6 @@ cache_acl: switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; break; @@ -4051,7 +4038,7 @@ out_end_trans: err = ret; inode->i_flags |= S_DEAD; out_release: - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); out_up_write: up_write(&fs_info->subvol_sem); if (err) { @@ -4583,7 +4570,7 @@ again: &cached_state); unlock_page(page); put_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -4848,19 +4835,16 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) /* * We're truncating a file that used to have good data down to - * zero. Make sure it gets into the ordered flush list so that - * any new writes get down to disk quickly. + * zero. Make sure any new writes to the file get on disk + * on close. 
*/ if (newsize == 0) - set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + set_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags); truncate_setsize(inode, newsize); - /* Disable nonlocked read DIO to avoid the endless truncate */ - btrfs_inode_block_unlocked_dio(BTRFS_I(inode)); inode_dio_wait(inode); - btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); ret = btrfs_truncate(inode, newsize == oldsize); if (ret && inode->i_nlink) { @@ -5305,15 +5289,15 @@ static void inode_tree_add(struct inode *inode) spin_unlock(&root->inode_lock); } -static void inode_tree_del(struct inode *inode) +static void inode_tree_del(struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; int empty = 0; spin_lock(&root->inode_lock); - if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { - rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + if (!RB_EMPTY_NODE(&inode->rb_node)) { + rb_erase(&inode->rb_node, &root->inode_tree); + RB_CLEAR_NODE(&inode->rb_node); empty = RB_EMPTY_ROOT(&root->inode_tree); } spin_unlock(&root->inode_lock); @@ -6311,7 +6295,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; d_instantiate_new(dentry, inode); out_unlock: @@ -6374,7 +6357,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, drop_inode = 1; } else { struct dentry *parent = dentry->d_parent; - int ret; err = btrfs_update_inode(trans, root, inode); if (err) @@ -6389,12 +6371,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } d_instantiate(dentry, inode); - ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, - true, NULL); - if (ret == BTRFS_NEED_TRANS_COMMIT) { - err = btrfs_commit_transaction(trans); - trans = NULL; - } + btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); } fail: @@ -6540,8 +6517,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - int ret; - int err = 0; + int ret = 0; u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); @@ -6569,7 +6545,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, } em = alloc_extent_map(); if (!em) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } em->start = EXTENT_MAP_HOLE; @@ -6579,7 +6555,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } @@ -6592,14 +6568,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, */ path->leave_spinning = 1; + path->recurse = btrfs_is_free_space_inode(inode); + ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { - err = ret; goto out; } else if (ret > 0) { if (path->slots[0] == 0) goto not_found; path->slots[0]--; + ret = 0; } leaf = path->nodes[0]; @@ -6625,7 +6603,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ if (!S_ISREG(inode->vfs_inode.i_mode)) { - err = -EUCLEAN; + ret = -EUCLEAN; btrfs_crit(fs_info, "regular/prealloc extent found for non-regular inode %llu", btrfs_ino(inode)); @@ -6643,12 +6621,11 @@ next: path->slots[0]++; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } else 
if (ret > 0) { + else if (ret > 0) goto not_found; - } + leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); @@ -6699,10 +6676,8 @@ next: BTRFS_COMPRESS_NONE) { ret = uncompress_inline(path, page, pg_offset, extent_offset, item); - if (ret) { - err = ret; + if (ret) goto out; - } } else { map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, @@ -6726,29 +6701,28 @@ not_found: em->len = len; em->block_start = EXTENT_MAP_HOLE; insert: + ret = 0; btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) { btrfs_err(fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); - err = -EIO; + ret = -EIO; goto out; } - err = 0; write_lock(&em_tree->lock); - err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); out: btrfs_free_path(path); trace_btrfs_get_extent(root, inode, em); - if (err) { + if (ret) { free_extent_map(em); - return ERR_PTR(err); + return ERR_PTR(ret); } - BUG_ON(!em); /* Error is always set */ return em; } @@ -7111,7 +7085,7 @@ out: } static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, int writing) + struct extent_state **cached_state, bool writing) { struct btrfs_ordered_extent *ordered; int ret = 0; @@ -7160,7 +7134,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); else ret = -ENOTBLK; btrfs_put_ordered_extent(ordered); @@ -7249,30 +7223,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, } -static int btrfs_get_blocks_direct_read(struct extent_map *em, - struct buffer_head *bh_result, - struct inode *inode, - u64 start, u64 len) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - - if (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - return -ENOENT; - - len = min(len, em->len - (start - em->start)); - - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = fs_info->fs_devices->latest_bdev; - set_buffer_mapped(bh_result); - - return 0; -} - static int btrfs_get_blocks_direct_write(struct extent_map **map, - struct buffer_head *bh_result, struct inode *inode, struct btrfs_dio_data *dio_data, u64 start, u64 len) @@ -7333,7 +7284,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, } /* this will cow the extent */ - len = bh_result->b_size; free_extent_map(em); *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); if (IS_ERR(em)) { @@ -7344,64 +7294,88 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, len = min(len, em->len - (start - em->start)); skip_cow: - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = fs_info->fs_devices->latest_bdev; - set_buffer_mapped(bh_result); - - if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - set_buffer_new(bh_result); - /* * Need to update the i_size under the extent lock so buffered * readers will get the updated i_size when we unlock. 
*/ - if (!dio_data->overwrite && start + len > i_size_read(inode)) + if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; - dio_data->unsubmitted_oe_range_end = start + len; - current->journal_info = dio_data; out: return ret; } -static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + loff_t length, unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; struct extent_state *cached_state = NULL; struct btrfs_dio_data *dio_data = NULL; - u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; - u64 len = bh_result->b_size; + const bool write = !!(flags & IOMAP_WRITE); int ret = 0; + u64 len = length; + bool unlock_extents = false; + bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB); + + /* + * We used current->journal_info here to see if we were sync, but + * there's a lot of tests in the enospc machinery to not do flushing if + * we have a journal_info set, so we need to clear this out and re-set + * it in iomap_end. + */ + ASSERT(current->journal_info == NULL || + current->journal_info == BTRFS_DIO_SYNC_STUB); + current->journal_info = NULL; - if (!create) + if (!write) len = min_t(u64, len, fs_info->sectorsize); lockstart = start; lockend = start + len - 1; - if (current->journal_info) { - /* - * Need to pull our outstanding extents and set journal_info to NULL so - * that anything that needs to check if there's a transaction doesn't get - * confused. - */ - dio_data = current->journal_info; - current->journal_info = NULL; + /* + * The generic stuff only does filemap_write_and_wait_range, which + * isn't enough if we've written compressed pages to this area, so we + * need to flush the dirty pages again to make absolutely sure that any + * outstanding dirty pages are on disk. + */ + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) { + ret = filemap_fdatawrite_range(inode->i_mapping, start, + start + length - 1); + if (ret) + return ret; + } + + dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); + if (!dio_data) + return -ENOMEM; + + dio_data->sync = sync; + dio_data->length = length; + if (write) { + dio_data->reserve = round_up(length, fs_info->sectorsize); + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), + &dio_data->data_reserved, + start, dio_data->reserve); + if (ret) { + extent_changeset_free(dio_data->data_reserved); + kfree(dio_data); + return ret; + } } + iomap->private = dio_data; + /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. 
*/ - if (lock_extent_direct(inode, lockstart, lockend, &cached_state, - create)) { + if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { ret = -ENOTBLK; goto err; } @@ -7433,35 +7407,47 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto unlock_err; } - if (create) { - ret = btrfs_get_blocks_direct_write(&em, bh_result, inode, - dio_data, start, len); + len = min(len, em->len - (start - em->start)); + if (write) { + ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, + start, len); if (ret < 0) goto unlock_err; - - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); + unlock_extents = true; + /* Recalc len in case the new em is smaller than requested */ + len = min(len, em->len - (start - em->start)); } else { - ret = btrfs_get_blocks_direct_read(em, bh_result, inode, - start, len); - /* Can be negative only if we read from a hole */ - if (ret < 0) { - ret = 0; - free_extent_map(em); - goto unlock_err; - } /* * We need to unlock only the end area that we aren't using. * The rest is going to be unlocked by the endio routine. */ - lockstart = start + bh_result->b_size; - if (lockstart < lockend) { - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - lockstart, lockend, &cached_state); - } else { - free_extent_state(cached_state); - } + lockstart = start + len; + if (lockstart < lockend) + unlock_extents = true; + } + + if (unlock_extents) + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state); + else + free_extent_state(cached_state); + + /* + * Translate extent map information to iomap. + * We trim the extents (and move the addr) even though iomap code does + * that, since we have locked only the parts we are performing I/O in. + */ + if ((em->block_start == EXTENT_MAP_HOLE) || + (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + } else { + iomap->addr = em->block_start + (start - em->start); + iomap->type = IOMAP_MAPPED; } + iomap->offset = start; + iomap->bdev = fs_info->fs_devices->latest_bdev; + iomap->length = len; free_extent_map(em); @@ -7471,8 +7457,63 @@ unlock_err: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: - if (dio_data) - current->journal_info = dio_data; + if (dio_data) { + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, start, + dio_data->reserve, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); + extent_changeset_free(dio_data->data_reserved); + kfree(dio_data); + } + return ret; +} + +static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, struct iomap *iomap) +{ + int ret = 0; + struct btrfs_dio_data *dio_data = iomap->private; + size_t submitted = dio_data->submitted; + const bool write = !!(flags & IOMAP_WRITE); + + if (!write && (iomap->type == IOMAP_HOLE)) { + /* If reading from a hole, unlock and return */ + unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); + goto out; + } + + if (submitted < length) { + pos += submitted; + length -= submitted; + if (write) + __endio_write_update_ordered(BTRFS_I(inode), pos, + length, false); + else + unlock_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1); + ret = -ENOTBLK; + } + + if (write) { + if (dio_data->reserve) + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, pos, + dio_data->reserve, true); + 
btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); + extent_changeset_free(dio_data->data_reserved); + } +out: + /* + * We're all done, we can re-set the current->journal_info now safely + * for our endio. + */ + if (dio_data->sync) { + ASSERT(current->journal_info == NULL); + current->journal_info = BTRFS_DIO_SYNC_STUB; + } + kfree(dio_data); + iomap->private = NULL; + return ret; } @@ -7496,7 +7537,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) dip->logical_offset + dip->bytes - 1); } - dio_end_io(dip->dio_bio); + bio_endio(dip->dio_bio); kfree(dip); } @@ -7730,24 +7771,11 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; dip->dio_bio = dio_bio; refcount_set(&dip->refs, 1); - - if (write) { - struct btrfs_dio_data *dio_data = current->journal_info; - - /* - * Setting range start and end to the same value means that - * no cleanup will happen in btrfs_direct_IO - */ - dio_data->unsubmitted_oe_range_end = dip->logical_offset + - dip->bytes; - dio_data->unsubmitted_oe_range_start = - dio_data->unsubmitted_oe_range_end; - } return dip; } -static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, - loff_t file_offset) +static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, + struct bio *dio_bio, loff_t file_offset) { const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); @@ -7764,6 +7792,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, int ret; blk_status_t status; struct btrfs_io_geometry geom; + struct btrfs_dio_data *dio_data = iomap->private; dip = btrfs_create_dio_private(dio_bio, inode, file_offset); if (!dip) { @@ -7772,8 +7801,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, file_offset + dio_bio->bi_iter.bi_size - 1); } dio_bio->bi_status = BLK_STS_RESOURCE; - dio_end_io(dio_bio); - return; + bio_endio(dio_bio); + return BLK_QC_T_NONE; } if (!write && csum) { @@ -7844,15 +7873,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, goto out_err; } + dio_data->submitted += clone_len; clone_offset += clone_len; start_sector += clone_len >> 9; file_offset += clone_len; } while (submit_len > 0); - return; + return BLK_QC_T_NONE; out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); + return BLK_QC_T_NONE; } static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, @@ -7888,37 +7919,59 @@ out: return retval; } -static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned flags) +{ + /* + * Now if we're still in the context of our submitter we know we can't + * safely run generic_write_sync(), so clear our flag here so that the + * caller knows to follow up with a sync. 
+ */ + if (current->journal_info == BTRFS_DIO_SYNC_STUB) { + current->journal_info = NULL; + return error; + } + + if (error) + return error; + + if (size) { + iocb->ki_flags |= IOCB_DSYNC; + return generic_write_sync(iocb, size); + } + + return 0; +} + +static const struct iomap_ops btrfs_dio_iomap_ops = { + .iomap_begin = btrfs_dio_iomap_begin, + .iomap_end = btrfs_dio_iomap_end, +}; + +static const struct iomap_dio_ops btrfs_dio_ops = { + .submit_io = btrfs_submit_direct, +}; + +static const struct iomap_dio_ops btrfs_sync_dops = { + .submit_io = btrfs_submit_direct, + .end_io = btrfs_maybe_fsync_end_io, +}; + +ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_dio_data dio_data = { 0 }; struct extent_changeset *data_reserved = NULL; loff_t offset = iocb->ki_pos; size_t count = 0; - int flags = 0; - bool wakeup = true; bool relock = false; ssize_t ret; if (check_direct_IO(fs_info, iter, offset)) return 0; - inode_dio_begin(inode); - - /* - * The generic stuff only does filemap_write_and_wait_range, which - * isn't enough if we've written compressed pages to this area, so - * we need to flush the dirty pages again to make absolutely sure - * that any outstanding dirty pages are on disk. - */ count = iov_iter_count(iter); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_fdatawrite_range(inode->i_mapping, offset, - offset + count - 1); - if (iov_iter_rw(iter) == WRITE) { /* * If the write DIO is beyond the EOF, we need to update @@ -7926,66 +7979,29 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * not unlock the i_mutex in this case. */ if (offset + count <= inode->i_size) { - dio_data.overwrite = 1; inode_unlock(inode); relock = true; } - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - offset, count); - if (ret) - goto out; - - /* - * We need to know how many extents we reserved so that we can - * do the accounting properly if we go over the number we - * originally calculated. Abuse current->journal_info for this. - */ - dio_data.reserve = round_up(count, - fs_info->sectorsize); - dio_data.unsubmitted_oe_range_start = (u64)offset; - dio_data.unsubmitted_oe_range_end = (u64)offset; - current->journal_info = &dio_data; down_read(&BTRFS_I(inode)->dio_sem); - } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - &BTRFS_I(inode)->runtime_flags)) { - inode_dio_end(inode); - flags = DIO_LOCKING | DIO_SKIP_HOLES; - wakeup = false; } - ret = __blockdev_direct_IO(iocb, inode, - fs_info->fs_devices->latest_bdev, - iter, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, flags); - if (iov_iter_rw(iter) == WRITE) { + /* + * We are actually a sync iocb, so we need our fancy endio to know + * if we need to sync.
+ */ + if (current->journal_info) + ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, + &btrfs_sync_dops, is_sync_kiocb(iocb)); + else + ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, + &btrfs_dio_ops, is_sync_kiocb(iocb)); + + if (ret == -ENOTBLK) + ret = 0; + + if (iov_iter_rw(iter) == WRITE) up_read(&BTRFS_I(inode)->dio_sem); - current->journal_info = NULL; - if (ret < 0 && ret != -EIOCBQUEUED) { - if (dio_data.reserve) - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, offset, dio_data.reserve, - true); - /* - * On error we might have left some ordered extents - * without submitting corresponding bios for them, so - * cleanup them up to avoid other tasks getting them - * and waiting for them to complete forever. - */ - if (dio_data.unsubmitted_oe_range_start < - dio_data.unsubmitted_oe_range_end) - __endio_write_update_ordered(BTRFS_I(inode), - dio_data.unsubmitted_oe_range_start, - dio_data.unsubmitted_oe_range_end - - dio_data.unsubmitted_oe_range_start, - false); - } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - offset, count - (size_t)ret, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), count); - } -out: - if (wakeup) - inode_dio_end(inode); + if (relock) inode_lock(inode); @@ -8002,12 +8018,24 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - return extent_fiemap(inode, fieinfo, start, len); + return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } int btrfs_readpage(struct file *file, struct page *page) { - return extent_read_full_page(page, btrfs_get_extent, 0); + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + u64 start = page_offset(page); + u64 end = start + PAGE_SIZE - 1; + unsigned long bio_flags = 0; + struct bio *bio = NULL; + int ret; + + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + + ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL); + if (bio) + ret = submit_one_bio(bio, 0, bio_flags); + return ret; } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) @@ -8091,15 +8119,15 @@ static int btrfs_migratepage(struct address_space *mapping, static void btrfs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { - struct inode *inode = page->mapping->host; - struct extent_io_tree *tree; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct extent_io_tree *tree = &inode->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_SIZE - 1; u64 start; u64 end; - int inode_evicting = inode->i_state & I_FREEING; + int inode_evicting = inode->vfs_inode.i_state & I_FREEING; /* * we have the page locked, so new writeback can't start, @@ -8110,7 +8138,6 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, */ wait_on_page_writeback(page); - tree = &BTRFS_I(inode)->io_tree; if (offset) { btrfs_releasepage(page, GFP_NOFS); return; @@ -8120,8 +8147,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, lock_extent_bits(tree, page_start, page_end, &cached_state); again: start = page_start; - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, - page_end - start + 1); + ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); if (ordered) { end = min(page_end, ordered->file_offset + ordered->num_bytes - 1); @@ -8142,7 +8168,7 @@ again: struct btrfs_ordered_inode_tree 
*tree; u64 new_len; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &inode->ordered_tree; spin_lock_irq(&tree->lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); @@ -8181,7 +8207,7 @@ again: * bit of its io_tree, and free the qgroup reserved data space. * Since the IO will never happen for this page. */ - btrfs_qgroup_free_data(BTRFS_I(inode), NULL, page_start, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | @@ -8283,7 +8309,7 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -8614,21 +8640,21 @@ void btrfs_free_inode(struct inode *inode) kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } -void btrfs_destroy_inode(struct inode *inode) +void btrfs_destroy_inode(struct inode *vfs_inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_extent *ordered; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(vfs_inode); + struct btrfs_root *root = inode->root; - WARN_ON(!hlist_empty(&inode->i_dentry)); - WARN_ON(inode->i_data.nrpages); - WARN_ON(BTRFS_I(inode)->block_rsv.reserved); - WARN_ON(BTRFS_I(inode)->block_rsv.size); - WARN_ON(BTRFS_I(inode)->outstanding_extents); - WARN_ON(BTRFS_I(inode)->delalloc_bytes); - WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); - WARN_ON(BTRFS_I(inode)->csum_bytes); - WARN_ON(BTRFS_I(inode)->defrag_bytes); + WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); + WARN_ON(vfs_inode->i_data.nrpages); + WARN_ON(inode->block_rsv.reserved); + WARN_ON(inode->block_rsv.size); + WARN_ON(inode->outstanding_extents); + WARN_ON(inode->delalloc_bytes); + WARN_ON(inode->new_delalloc_bytes); + WARN_ON(inode->csum_bytes); + WARN_ON(inode->defrag_bytes); /* * This can happen where we create an inode, but somebody else also @@ -8643,7 +8669,7 @@ void btrfs_destroy_inode(struct inode *inode) if (!ordered) break; else { - btrfs_err(fs_info, + btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", ordered->file_offset, ordered->num_bytes); btrfs_remove_ordered_extent(inode, ordered); @@ -8651,11 +8677,11 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_put_ordered_extent(ordered); } } - btrfs_qgroup_check_reserved_leak(BTRFS_I(inode)); + btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); - btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); - btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1); - btrfs_put_root(BTRFS_I(inode)->root); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); + btrfs_put_root(inode->root); } int btrfs_drop_inode(struct inode *inode) @@ -8780,27 +8806,19 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; struct timespec64 ctime = current_time(old_inode); - struct dentry *parent; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; int ret; + int ret2; bool root_log_pinned = false; bool dest_log_pinned = false; - struct btrfs_log_ctx ctx_root; - struct btrfs_log_ctx ctx_dest; - bool sync_log_root = false; - bool sync_log_dest = false; - bool commit_transaction = false; /* we only 
allow rename subvolume link between subvolumes */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) return -EXDEV; - btrfs_init_log_ctx(&ctx_root, old_inode); - btrfs_init_log_ctx(&ctx_dest, new_inode); - /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || new_ino == BTRFS_FIRST_FREE_OBJECTID) @@ -8942,30 +8960,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode)->dir_index = new_idx; if (root_log_pinned) { - parent = new_dentry->d_parent; - ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), - BTRFS_I(old_dir), parent, - false, &ctx_root); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log_root = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; + btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), + new_dentry->d_parent); btrfs_end_log_trans(root); root_log_pinned = false; } if (dest_log_pinned) { - if (!commit_transaction) { - parent = old_dentry->d_parent; - ret = btrfs_log_new_name(trans, BTRFS_I(new_inode), - BTRFS_I(new_dir), parent, - false, &ctx_dest); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log_dest = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; - } + btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir), + old_dentry->d_parent); btrfs_end_log_trans(dest); dest_log_pinned = false; } @@ -8998,46 +9000,13 @@ out_fail: dest_log_pinned = false; } } - if (!ret && sync_log_root && !commit_transaction) { - ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, - &ctx_root); - if (ret) - commit_transaction = true; - } - if (!ret && sync_log_dest && !commit_transaction) { - ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root, - &ctx_dest); - if (ret) - commit_transaction = true; - } - if (commit_transaction) { - /* - * We may have set commit_transaction when logging the new name - * in the destination root, in which case we left the source - * root context in the list of log contextes. So make sure we - * remove it to avoid invalid memory accesses, since the context - * was allocated in our stack frame. - */ - if (sync_log_root) { - mutex_lock(&root->log_mutex); - list_del_init(&ctx_root.list); - mutex_unlock(&root->log_mutex); - } - ret = btrfs_commit_transaction(trans); - } else { - int ret2; - - ret2 = btrfs_end_transaction(trans); - ret = ret ? ret : ret2; - } + ret2 = btrfs_end_transaction(trans); + ret = ret ? 
ret : ret2; out_notrans: if (new_ino == BTRFS_FIRST_FREE_OBJECTID || old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); - ASSERT(list_empty(&ctx_root.list)); - ASSERT(list_empty(&ctx_dest.list)); - return ret; } @@ -9105,11 +9074,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = d_inode(old_dentry); u64 index = 0; int ret; + int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); bool log_pinned = false; - struct btrfs_log_ctx ctx; - bool sync_log = false; - bool commit_transaction = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9259,17 +9226,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BTRFS_I(old_inode)->dir_index = index; if (log_pinned) { - struct dentry *parent = new_dentry->d_parent; - - btrfs_init_log_ctx(&ctx, old_inode); - ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), - BTRFS_I(old_dir), parent, - false, &ctx); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; + btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), + new_dentry->d_parent); btrfs_end_log_trans(root); log_pinned = false; } @@ -9306,23 +9264,8 @@ out_fail: btrfs_end_log_trans(root); log_pinned = false; } - if (!ret && sync_log) { - ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx); - if (ret) - commit_transaction = true; - } else if (sync_log) { - mutex_lock(&root->log_mutex); - list_del(&ctx.list); - mutex_unlock(&root->log_mutex); - } - if (commit_transaction) { - ret = btrfs_commit_transaction(trans); - } else { - int ret2; - - ret2 = btrfs_end_transaction(trans); - ret = ret ? ret : ret2; - } + ret2 = btrfs_end_transaction(trans); + ret = ret ? ret : ret2; out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); @@ -9388,7 +9331,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. 
*/ -static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) +static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot) { struct btrfs_inode *binode; struct inode *inode; @@ -9428,9 +9371,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) list_add_tail(&work->list, &works); btrfs_queue_work(root->fs_info->flush_workers, &work->work); - ret++; - if (nr != -1 && ret >= nr) - goto out; + if (*nr != U64_MAX) { + (*nr)--; + if (*nr == 0) + goto out; + } cond_resched(); spin_lock(&root->delalloc_lock); } @@ -9455,18 +9400,15 @@ out: int btrfs_start_delalloc_snapshot(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - int ret; + u64 nr = U64_MAX; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - ret = start_delalloc_inodes(root, -1, true); - if (ret > 0) - ret = 0; - return ret; + return start_delalloc_inodes(root, &nr, true); } -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr) { struct btrfs_root *root; struct list_head splice; @@ -9489,15 +9431,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, nr, false); + ret = start_delalloc_inodes(root, &nr, false); btrfs_put_root(root); if (ret < 0) goto out; - - if (nr != -1) { - nr -= ret; - WARN_ON(nr < 0); - } spin_lock(&fs_info->delalloc_root_lock); } spin_unlock(&fs_info->delalloc_root_lock); @@ -9568,7 +9505,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) @@ -9633,11 +9569,15 @@ out_unlock: return err; } -static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans, +static struct btrfs_trans_handle *insert_prealloc_file_extent( + struct btrfs_trans_handle *trans_in, struct inode *inode, struct btrfs_key *ins, u64 file_offset) { struct btrfs_file_extent_item stack_fi; + struct btrfs_replace_extent_info extent_info; + struct btrfs_trans_handle *trans = trans_in; + struct btrfs_path *path; u64 start = ins->objectid; u64 len = ins->offset; int ret; @@ -9654,10 +9594,40 @@ static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len); if (ret < 0) - return ret; - return insert_reserved_file_extent(trans, BTRFS_I(inode), file_offset, - &stack_fi, ret); + return ERR_PTR(ret); + + if (trans) { + ret = insert_reserved_file_extent(trans, BTRFS_I(inode), + file_offset, &stack_fi, ret); + if (ret) + return ERR_PTR(ret); + return trans; + } + + extent_info.disk_offset = start; + extent_info.disk_len = len; + extent_info.data_offset = 0; + extent_info.data_len = len; + extent_info.file_offset = file_offset; + extent_info.extent_buf = (char *)&stack_fi; + extent_info.is_new_extent = true; + extent_info.qgroup_reserved = ret; + extent_info.insertions = 0; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = btrfs_replace_file_extents(inode, path, file_offset, + file_offset + len - 1, &extent_info, + &trans); + btrfs_free_path(path); + if (ret) + return ERR_PTR(ret); + + return trans; } + static int 
__btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint, @@ -9680,14 +9650,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (trans) own_trans = false; while (num_bytes > 0) { - if (own_trans) { - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - } - cur_bytes = min_t(u64, num_bytes, SZ_256M); cur_bytes = max(cur_bytes, min_size); /* @@ -9699,11 +9661,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, cur_bytes = min(cur_bytes, last_alloc); ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, min_size, 0, *alloc_hint, &ins, 1, 0); - if (ret) { - if (own_trans) - btrfs_end_transaction(trans); + if (ret) break; - } /* * We've reserved this space, and thus converted it from @@ -9716,13 +9675,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); last_alloc = ins.offset; - ret = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); - if (ret) { + trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); - btrfs_abort_transaction(trans, ret); - if (own_trans) - btrfs_end_transaction(trans); break; } @@ -9785,8 +9742,10 @@ next: break; } - if (own_trans) + if (own_trans) { btrfs_end_transaction(trans); + trans = NULL; + } } if (clear_offset < end) btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset, @@ -9865,7 +9824,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; ret = btrfs_init_inode_security(trans, inode, dir, NULL); if (ret) @@ -10072,14 +10030,14 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, /* * Balance or device remove/replace/resize can move stuff around from - * under us. The EXCL_OP flag makes sure they aren't running/won't run - * concurrently while we are mapping the swap extents, and - * fs_info->swapfile_pins prevents them from running while the swap file - * is active and moving the extents. Note that this also prevents a - * concurrent device add which isn't actually necessary, but it's not + * under us. The exclop protection makes sure they aren't running/won't + * run concurrently while we are mapping the swap extents, and + * fs_info->swapfile_pins prevents them from running while the swap + * file is active and moving the extents. Note that this also prevents + * a concurrent device add which isn't actually necessary, but it's not * really worth the trouble to allow it. 
*/ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); return -EBUSY; @@ -10225,7 +10183,7 @@ out: if (ret) btrfs_swap_deactivate(file); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); if (ret) return ret; @@ -10283,12 +10241,6 @@ static const struct file_operations btrfs_dir_file_operations = { .fsync = btrfs_sync_file, }; -static const struct extent_io_ops btrfs_extent_io_ops = { - /* mandatory callbacks */ - .submit_bio_hook = btrfs_submit_bio_hook, - .readpage_end_io_hook = btrfs_readpage_end_io_hook, -}; - /* * btrfs doesn't support the bmap operation because swapfiles * use bmap to make a mapping of extents in the file. They assume @@ -10306,7 +10258,7 @@ static const struct address_space_operations btrfs_aops = { .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readahead = btrfs_readahead, - .direct_IO = btrfs_direct_IO, + .direct_IO = noop_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, #ifdef CONFIG_MIGRATION diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ac45f022b495..ab408a23ba32 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -378,6 +378,18 @@ static int check_xflags(unsigned int flags) return 0; } +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type); +} + +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) +{ + WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); + sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); +} + /* * Set the xflags from the internal inode flags. The remaining items of fsxattr * are zeroed. 
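(Aside: the btrfs_exclop_start()/btrfs_exclop_finish() helpers added above replace the old test_and_set_bit(BTRFS_FS_EXCL_OP) flag used throughout the ioctls that follow. Instead of a single bit, fs_info->exclusive_operation now records which exclusive operation currently owns the filesystem, claimed atomically with a cmpxchg() from BTRFS_EXCLOP_NONE. Below is a minimal, self-contained sketch of the same claim/release pattern using C11 atomics rather than the kernel's cmpxchg(); the names and the userspace setting are illustrative, not kernel API.)

#include <stdatomic.h>
#include <stdbool.h>

enum exclop { EXCLOP_NONE, EXCLOP_BALANCE, EXCLOP_DEV_ADD, EXCLOP_RESIZE };

static _Atomic int exclusive_operation = EXCLOP_NONE;

/* Claim exclusivity; succeeds for exactly one caller until finish runs. */
static bool exclop_start(int type)
{
	int expected = EXCLOP_NONE;

	return atomic_compare_exchange_strong(&exclusive_operation,
					      &expected, type);
}

/* Drop exclusivity; the kernel helper additionally notifies sysfs here. */
static void exclop_finish(void)
{
	atomic_store(&exclusive_operation, EXCLOP_NONE);
}

Unlike the single BTRFS_FS_EXCL_OP bit, the stored value also lets the filesystem report which operation is running, which is what the sysfs_notify() of "exclusive_operation" in btrfs_exclop_finish() exposes to userspace.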
@@ -618,7 +630,7 @@ static noinline int create_subvol(struct inode *dir, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); goto fail_free; } trans->block_rsv = &block_rsv; @@ -628,7 +640,8 @@ static noinline int create_subvol(struct inode *dir, if (ret) goto fail; - leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, + BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; @@ -742,7 +755,7 @@ fail: kfree(root_item); trans->block_rsv = NULL; trans->bytes_reserved = 0; - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); err = btrfs_commit_transaction(trans); if (err && !ret) @@ -856,7 +869,7 @@ fail: if (ret && pending_snapshot->snap) pending_snapshot->snap->anon_dev = 0; btrfs_put_root(pending_snapshot->snap); - btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); + btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv); free_pending: if (pending_snapshot->anon_dev) free_anon_bdev(pending_snapshot->anon_dev); @@ -1306,7 +1319,7 @@ again: break; unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); lock_page(page); /* @@ -1638,7 +1651,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret) return ret; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) { mnt_drop_write_file(file); return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } @@ -1752,7 +1765,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, out_free: kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); mnt_drop_write_file(file); return ret; } @@ -2193,7 +2206,8 @@ static noinline int search_ioctl(struct inode *inode, key.offset = sk->min_offset; while (1) { - ret = fault_in_pages_writeable(ubuf, *buf_size - sk_offset); + ret = fault_in_pages_writeable(ubuf + sk_offset, + *buf_size - sk_offset); if (ret) break; @@ -3125,7 +3139,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; vol_args = memdup_user(arg, sizeof(*vol_args)); @@ -3142,7 +3156,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return ret; } @@ -3171,7 +3185,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto out; } - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out; } @@ -3182,7 +3196,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; ret = btrfs_rm_device(fs_info, vol_args->name, 0); } - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); if (!ret) { if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) @@ -3213,7 +3227,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user 
*arg) if (ret) return ret; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out_drop_write; } @@ -3231,7 +3245,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) btrfs_info(fs_info, "disk deleted %s", vol_args->name); kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); out_drop_write: mnt_drop_write_file(file); @@ -3461,15 +3475,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *tmp; info = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(tmp, &fs_info->space_info, - list) { + list_for_each_entry(tmp, &fs_info->space_info, list) { if (tmp->flags == types[i]) { info = tmp; break; } } - rcu_read_unlock(); if (!info) continue; @@ -3517,15 +3528,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, break; info = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(tmp, &fs_info->space_info, - list) { + list_for_each_entry(tmp, &fs_info->space_info, list) { if (tmp->flags == types[i]) { info = tmp; break; } } - rcu_read_unlock(); if (!info) continue; @@ -3735,11 +3743,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, ret = -EROFS; goto out; } - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { ret = btrfs_dev_replace_by_ioctl(fs_info, p); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); } break; case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: @@ -3950,7 +3958,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) return ret; again: - if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { mutex_lock(&fs_info->balance_mutex); need_unlock = true; goto locked; @@ -3996,7 +4004,6 @@ again: } locked: - BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); @@ -4051,10 +4058,10 @@ locked: do_balance: /* - * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to - * btrfs_balance. bctl is freed in reset_balance_state, or, if - * restriper was paused all the way until unmount, in free_fs_info. - * The flag should be cleared after reset_balance_state. + * Ownership of bctl and exclusive operation goes to btrfs_balance. + * bctl is freed in reset_balance_state, or, if restriper was paused + * all the way until unmount, in free_fs_info. The flag should be + * cleared after reset_balance_state. */ need_unlock = false; @@ -4073,7 +4080,7 @@ out_bargs: out_unlock: mutex_unlock(&fs_info->balance_mutex); if (need_unlock) - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); out: mnt_drop_write_file(file); return ret; @@ -4896,7 +4903,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: { int ret; - ret = btrfs_start_delalloc_roots(fs_info, -1); + ret = btrfs_start_delalloc_roots(fs_info, U64_MAX); if (ret) return ret; ret = btrfs_sync_fs(inode->i_sb, 1); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index f75612e18a82..66e02ebdd340 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -57,8 +57,8 @@ * performance reasons. * * - * Lock nesting - * ------------ + * Lock recursion + * -------------- * * A write operation on a tree might indirectly start a look up on the same * tree. 
This can happen when btrfs_cow_block locks the tree and needs to @@ -201,7 +201,7 @@ void btrfs_set_lock_blocking_read(struct extent_buffer *eb) * lock, but it won't change to or away from us. If we have the write * lock, we are the owner and it'll never change. */ - if (eb->lock_nested && current->pid == eb->lock_owner) + if (eb->lock_recursed && current->pid == eb->lock_owner) return; btrfs_assert_tree_read_locked(eb); atomic_inc(&eb->blocking_readers); @@ -225,7 +225,7 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) * lock, but it won't change to or away from us. If we have the write * lock, we are the owner and it'll never change. */ - if (eb->lock_nested && current->pid == eb->lock_owner) + if (eb->lock_recursed && current->pid == eb->lock_owner) return; if (eb->blocking_writers == 0) { btrfs_assert_spinning_writers_put(eb); @@ -244,7 +244,8 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) * * The rwlock is held upon exit. */ -void btrfs_tree_read_lock(struct extent_buffer *eb) +void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest, + bool recurse) { u64 start_ns = 0; @@ -263,8 +264,9 @@ again: * depends on this as it may be called on a partly * (write-)locked tree. */ - BUG_ON(eb->lock_nested); - eb->lock_nested = true; + WARN_ON(!recurse); + BUG_ON(eb->lock_recursed); + eb->lock_recursed = true; read_unlock(&eb->lock); trace_btrfs_tree_read_lock(eb, start_ns); return; @@ -279,6 +281,11 @@ again: trace_btrfs_tree_read_lock(eb, start_ns); } +void btrfs_tree_read_lock(struct extent_buffer *eb) +{ + __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false); +} + /* * Lock extent buffer for read, optimistically expecting that there are no * contending blocking writers. If there are, don't wait. @@ -362,11 +369,11 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) /* * if we're nested, we have the write lock. No new locking * is needed as long as we are the lock owner. - * The write unlock will do a barrier for us, and the lock_nested + * The write unlock will do a barrier for us, and the lock_recursed * field only matters to the lock owner. */ - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = false; + if (eb->lock_recursed && current->pid == eb->lock_owner) { + eb->lock_recursed = false; return; } btrfs_assert_tree_read_locked(eb); @@ -388,11 +395,11 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) /* * if we're nested, we have the write lock. No new locking * is needed as long as we are the lock owner. - * The write unlock will do a barrier for us, and the lock_nested + * The write unlock will do a barrier for us, and the lock_recursed * field only matters to the lock owner. */ - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = false; + if (eb->lock_recursed && current->pid == eb->lock_owner) { + eb->lock_recursed = false; return; } btrfs_assert_tree_read_locked(eb); @@ -409,7 +416,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) * * The rwlock is held for write upon exit. */ -void btrfs_tree_lock(struct extent_buffer *eb) +void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) __acquires(&eb->lock) { u64 start_ns = 0; @@ -434,6 +441,11 @@ again: trace_btrfs_tree_lock(eb, start_ns); } +void btrfs_tree_lock(struct extent_buffer *eb) +{ + __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL); +} + /* * Release the write lock, either blocking or spinning (ie. 
there's no need * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking). @@ -552,13 +564,14 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) * * Return: root extent buffer with read lock held */ -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, + bool recurse) { struct extent_buffer *eb; while (1) { eb = btrfs_root_node(root); - btrfs_tree_read_lock(eb); + __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse); if (eb == root->node) break; btrfs_tree_read_unlock(eb); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index d715846c10b8..3ea81ed3320b 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -16,11 +16,81 @@ #define BTRFS_WRITE_LOCK_BLOCKING 3 #define BTRFS_READ_LOCK_BLOCKING 4 +/* + * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at + * the time of this patch is 8, which is how many we use. Keep this in mind if + * you decide you want to add another subclass. + */ +enum btrfs_lock_nesting { + BTRFS_NESTING_NORMAL, + + /* + * When we COW a block we are holding the lock on the original block, + * and since our lockdep maps are rootid+level, this confuses lockdep + * when we lock the newly allocated COW'd block. Handle this by having + * a subclass for COW'ed blocks so that lockdep doesn't complain. + */ + BTRFS_NESTING_COW, + + /* + * Oftentimes we need to lock adjacent nodes on the same level while + * still holding the lock on the original node we searched to, such as + * for searching forward or for split/balance. + * + * Because of this we need to indicate to lockdep that this is + * acceptable by having a different subclass for each of these + * operations. + */ + BTRFS_NESTING_LEFT, + BTRFS_NESTING_RIGHT, + + /* + * When splitting we will be holding a lock on the left/right node when + * we need to cow that node, thus we need a new set of subclasses for + * these two operations. + */ + BTRFS_NESTING_LEFT_COW, + BTRFS_NESTING_RIGHT_COW, + + /* + * When splitting we may push nodes to the left or right, but still use + * the subsequent nodes in our path, keeping our locks on those adjacent + * blocks. Thus when we go to allocate a new split block we've already + * used up all of our available subclasses, so this subclass exists to + * handle this case where we need to allocate a new split block. + */ + BTRFS_NESTING_SPLIT, + + /* + * When promoting a new block to a root we need to have a special + * subclass so we don't confuse lockdep, as it will appear that we are + * locking a higher level node before a lower level one. Copying also + * has this problem as it appears we're locking the same block again + * when we make a snapshot of an existing root. + */ + BTRFS_NESTING_NEW_ROOT, + + /* + * We are limited to MAX_LOCKDEP_SUBCLASSES subclasses, so + * add this in here and add a static_assert to keep us from going over + * the limit. As of this writing we're limited to 8, and we're + * definitely using 8, hence this check to keep us from messing up in + * the future.
+ */ + BTRFS_NESTING_MAX, +}; + +static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, + "too many lock subclasses defined"); + struct btrfs_path; +void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); +void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest, + bool recurse); void btrfs_tree_read_lock(struct extent_buffer *eb); void btrfs_tree_read_unlock(struct extent_buffer *eb); void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); @@ -29,6 +99,14 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); int btrfs_tree_read_lock_atomic(struct extent_buffer *eb); +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, + bool recurse); + +static inline struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +{ + return __btrfs_read_lock_root_node(root, false); +} #ifdef CONFIG_BTRFS_DEBUG static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ebac13389e7e..87bac9ecdf4c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -212,11 +212,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); INIT_LIST_HEAD(&entry->list); + INIT_LIST_HEAD(&entry->log_list); INIT_LIST_HEAD(&entry->root_extent_list); INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); - trace_btrfs_ordered_extent_add(&inode->vfs_inode, entry); + trace_btrfs_ordered_extent_add(inode, entry); spin_lock_irq(&tree->lock); node = tree_insert(&tree->tree, file_offset, @@ -377,17 +378,16 @@ out: * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used * to make sure this function only returns 1 once for a given ordered extent. */ -int btrfs_dec_test_ordered_pending(struct inode *inode, +int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size, int uptodate) { - struct btrfs_ordered_inode_tree *tree; + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; int ret; - tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irqsave(&tree->lock, flags); if (cached && *cached) { entry = *cached; @@ -408,7 +408,7 @@ have_entry: } if (io_size > entry->bytes_left) { - btrfs_crit(BTRFS_I(inode)->root->fs_info, + btrfs_crit(inode->root->fs_info, "bad ordered accounting left %llu size %llu", entry->bytes_left, io_size); } @@ -441,10 +441,11 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) struct list_head *cur; struct btrfs_ordered_sum *sum; - trace_btrfs_ordered_extent_put(entry->inode, entry); + trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry); if (refcount_dec_and_test(&entry->refs)) { ASSERT(list_empty(&entry->root_extent_list)); + ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) btrfs_add_delayed_iput(entry->inode); @@ -462,14 +463,14 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) * remove an ordered extent from the tree. 
No references are dropped * and waiters are woken up. */ -void btrfs_remove_ordered_extent(struct inode *inode, +void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_inode_tree *tree; - struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; + bool pending; /* This is paired with btrfs_add_ordered_extent. */ spin_lock(&btrfs_inode->lock); @@ -491,13 +492,41 @@ void btrfs_remove_ordered_extent(struct inode *inode, if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags); spin_unlock_irq(&tree->lock); + /* + * The currently running transaction is waiting on us; we need to let it + * know that we're complete and wake it up. + */ + if (pending) { + struct btrfs_transaction *trans; + + /* + * The checks for trans are just a formality; it should be set, + * but if it isn't we don't want to deref/assert under the spin + * lock, so be nice and check if trans is set, but ASSERT() so + * if it isn't set a developer will notice. + */ + spin_lock(&fs_info->trans_lock); + trans = fs_info->running_transaction; + if (trans) + refcount_inc(&trans->use_count); + spin_unlock(&fs_info->trans_lock); + + ASSERT(trans); + if (trans) { + if (atomic_dec_and_test(&trans->pending_ordered)) + wake_up(&trans->pending_wait); + btrfs_put_transaction(trans); + } + } + spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; - trace_btrfs_ordered_extent_remove(inode, entry); + trace_btrfs_ordered_extent_remove(btrfs_inode, entry); if (!root->nr_ordered_extents) { spin_lock(&fs_info->ordered_root_lock); @@ -514,7 +543,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; ordered = container_of(work, struct btrfs_ordered_extent, flush_work); - btrfs_start_ordered_extent(ordered->inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); complete(&ordered->completion); } @@ -620,12 +649,11 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, * in the extent, and it waits on the io completion code to insert * metadata into the btree corresponding to the extent */ -void btrfs_start_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry, - int wait) +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; + struct btrfs_inode *inode = BTRFS_I(entry->inode); trace_btrfs_ordered_extent_start(inode, entry); @@ -635,7 +663,7 @@ void btrfs_start_ordered_extent(struct inode *inode, * for the flusher thread to find them */ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - filemap_fdatawrite_range(inode->i_mapping, start, end); + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); @@ -679,7 +707,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) end = orig_end; while (1) { - ordered = btrfs_lookup_first_ordered_extent(inode, end); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end); if (!ordered) break; if (ordered->file_offset > orig_end) { @@ -690,7 +718,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start,
u64 len) btrfs_put_ordered_extent(ordered); break; } - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); end = ordered->file_offset; /* * If the ordered extent had an error save the error but don't @@ -775,17 +803,45 @@ out: } /* + * Adds all ordered extents to the given list. The list ends up sorted by the + * file_offset of the ordered extents. + */ +void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, + struct list_head *list) +{ + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct rb_node *n; + + ASSERT(inode_is_locked(&inode->vfs_inode)); + + spin_lock_irq(&tree->lock); + for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + struct btrfs_ordered_extent *ordered; + + ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); + + if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) + continue; + + ASSERT(list_empty(&ordered->log_list)); + list_add_tail(&ordered->log_list, list); + refcount_inc(&ordered->refs); + } + spin_unlock_irq(&tree->lock); +} + +/* * lookup and return any extent before 'file_offset'. NULL is returned * if none is found */ struct btrfs_ordered_extent * -btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) +btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &inode->ordered_tree; spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) @@ -803,20 +859,21 @@ out: * try to find a checksum. This is used because we allow pages to * be reclaimed before their checksum is actually put into the btree */ -int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u8 *sum, int len) +int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, + u64 disk_bytenr, u8 *sum, int len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_sum *ordered_sum; struct btrfs_ordered_extent *ordered; - struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; unsigned long num_sectors; unsigned long i; u32 sectorsize = btrfs_inode_sectorsize(inode); + const u8 blocksize_bits = inode->vfs_inode.i_sb->s_blocksize_bits; const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int index = 0; - ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), offset); + ordered = btrfs_lookup_ordered_extent(inode, offset); if (!ordered) return 0; @@ -824,10 +881,8 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { if (disk_bytenr >= ordered_sum->bytenr && disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { - i = (disk_bytenr - ordered_sum->bytenr) >> - inode->i_sb->s_blocksize_bits; - num_sectors = ordered_sum->len >> - inode->i_sb->s_blocksize_bits; + i = (disk_bytenr - ordered_sum->bytenr) >> blocksize_bits; + num_sectors = ordered_sum->len >> blocksize_bits; num_sectors = min_t(int, len - index, num_sectors - i); memcpy(sum + index, ordered_sum->sums + i * csum_size, num_sectors * csum_size); @@ -883,7 +938,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, break; } unlock_extent_cached(&inode->io_tree, start, end, cachedp); - btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); + 
btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); } } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index d61ea9c880a3..c3a2325e64a4 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -56,6 +56,12 @@ enum { BTRFS_ORDERED_TRUNCATED, /* Regular IO for COW */ BTRFS_ORDERED_REGULAR, + /* Used during fsync to track already logged extents */ + BTRFS_ORDERED_LOGGED, + /* We have already logged all the csums of the ordered extent */ + BTRFS_ORDERED_LOGGED_CSUM, + /* We wait for this extent to complete in the current transaction */ + BTRFS_ORDERED_PENDING, }; struct btrfs_ordered_extent { @@ -104,6 +110,9 @@ struct btrfs_ordered_extent { /* list of checksums for insertion when the extent io is done */ struct list_head list; + /* used for fast fsyncs */ + struct list_head log_list; + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; @@ -142,9 +151,9 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) } void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); -void btrfs_remove_ordered_extent(struct inode *inode, +void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); -int btrfs_dec_test_ordered_pending(struct inode *inode, +int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size, int uptodate); int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, @@ -165,17 +174,18 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry, int wait); +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * -btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len); -int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u8 *sum, int len); +void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, + struct list_head *list); +int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, + u64 disk_bytenr, u8 *sum, int len); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 61f44e78e3c9..7695c4783d33 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -7,6 +7,44 @@ #include "disk-io.h" #include "print-tree.h" +struct root_name_map { + u64 id; + char name[16]; +}; + +static const struct root_name_map root_map[] = { + { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" }, + { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" }, + { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" }, + { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" }, + { BTRFS_FS_TREE_OBJECTID, "FS_TREE" }, + { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" }, + { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" }, + { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" }, + { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" }, + { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, + { 
BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }, +}; + +const char *btrfs_root_name(u64 objectid, char *buf) +{ + int i; + + if (objectid == BTRFS_TREE_RELOC_OBJECTID) { + snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, + "TREE_RELOC offset=%llu", objectid); + return buf; + } + + for (i = 0; i < ARRAY_SIZE(root_map); i++) { + if (root_map[i].id == objectid) + return root_map[i].name; + } + + snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, "%llu", objectid); + return buf; +} + static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) { int num_stripes = btrfs_chunk_num_stripes(eb, chunk); @@ -95,9 +133,10 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) * offset is supposed to be a tree block which * must be aligned to nodesize. */ - if (!IS_ALIGNED(offset, eb->fs_info->nodesize)) - pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n", - offset, (unsigned long long)eb->fs_info->nodesize); + if (!IS_ALIGNED(offset, eb->fs_info->sectorsize)) + pr_info( + "\t\t\t(parent %llu not aligned to sectorsize %u)\n", + offset, eb->fs_info->sectorsize); break; case BTRFS_EXTENT_DATA_REF_KEY: dref = (struct btrfs_extent_data_ref *)(&iref->offset); @@ -112,8 +151,9 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) * must be aligned to nodesize. */ if (!IS_ALIGNED(offset, eb->fs_info->nodesize)) - pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n", - offset, (unsigned long long)eb->fs_info->nodesize); + pr_info( + "\t\t\t(parent %llu not aligned to sectorsize %u)\n", + offset, eb->fs_info->sectorsize); break; default: pr_cont("(extent %llu has INVALID ref type %d)\n", diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index e6bb38fd75ad..78b99385a503 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -6,7 +6,11 @@ #ifndef BTRFS_PRINT_TREE_H #define BTRFS_PRINT_TREE_H +/* Buffer size to contain tree name and possibly additional data (offset) */ +#define BTRFS_ROOT_NAME_BUF_LEN 48 + void btrfs_print_leaf(struct extent_buffer *l); void btrfs_print_tree(struct extent_buffer *c, bool follow); +const char *btrfs_root_name(u64 objectid, char *buf); #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c0f350c3a0cf..580899bdb991 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2315,7 +2315,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, * Update qgroup rfer/excl counters. * Rfer update is easy, codes can explain themselves. * - * Excl update is tricky, the update is split into 2 part. + * Excl update is tricky, the update is split into 2 parts. 
* Part 1: Possible exclusive <-> sharing detect: * | A | !A | * ------------------------------------- diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 243a2e44526e..9d4f5316a7e8 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -767,31 +767,39 @@ static void reada_start_machine_worker(struct btrfs_work *work) kfree(rmw); } -static void __reada_start_machine(struct btrfs_fs_info *fs_info) +/* Try to start up to 10k READA requests for a group of devices */ +static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices) { - struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u64 enqueued; u64 total = 0; - int i; + struct btrfs_device *device; -again: do { enqueued = 0; - mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (atomic_read(&device->reada_in_flight) < MAX_IN_FLIGHT) enqueued += reada_start_machine_dev(device); } - mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; } while (enqueued && total < 10000); - if (fs_devices->seed) { - fs_devices = fs_devices->seed; - goto again; - } + return total; +} + +static void __reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; + int i; + u64 enqueued = 0; + + mutex_lock(&fs_devices->device_list_mutex); + + enqueued += reada_start_for_fsdevs(fs_devices); + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) + enqueued += reada_start_for_fsdevs(seed_devs); + + mutex_unlock(&fs_devices->device_list_mutex); if (enqueued == 0) return; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 5cd02514cf4d..99aa87c08912 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -45,7 +45,7 @@ out: return ret; } -static int copy_inline_to_page(struct inode *inode, +static int copy_inline_to_page(struct btrfs_inode *inode, const u64 file_offset, char *inline_data, const u64 size, @@ -58,6 +58,7 @@ static int copy_inline_to_page(struct inode *inode, char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); struct extent_changeset *data_reserved = NULL; struct page *page = NULL; + struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; ASSERT(IS_ALIGNED(file_offset, block_size)); @@ -68,24 +69,23 @@ static int copy_inline_to_page(struct inode *inode, * reservation here. Also we must not do the reservation while holding * a transaction open, otherwise we would deadlock. 
*/ - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - file_offset, block_size); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset, + block_size); if (ret) goto out; - page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT, - btrfs_alloc_write_mask(inode->i_mapping)); + page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT, + btrfs_alloc_write_mask(mapping)); if (!page) { ret = -ENOMEM; goto out_unlock; } set_page_extent_mapped(page); - clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end, + clear_extent_bit(&inode->io_tree, file_offset, range_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, NULL); - ret = btrfs_set_extent_delalloc(BTRFS_I(inode), file_offset, range_end, - 0, NULL); + ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); if (ret) goto out_unlock; @@ -134,9 +134,9 @@ out_unlock: put_page(page); } if (ret) - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - file_offset, block_size, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), block_size); + btrfs_delalloc_release_space(inode, data_reserved, file_offset, + block_size, true); + btrfs_delalloc_release_extents(inode, block_size); out: extent_changeset_free(data_reserved); @@ -167,8 +167,8 @@ static int clone_copy_inline_extent(struct inode *dst, struct btrfs_key key; if (new_key->offset > 0) { - ret = copy_inline_to_page(dst, new_key->offset, inline_data, - size, datal, comp_type); + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); goto out; } @@ -194,7 +194,7 @@ static int clone_copy_inline_extent(struct inode *dst, * inline extent's data to the page. */ ASSERT(key.offset > 0); - ret = copy_inline_to_page(dst, new_key->offset, + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, inline_data, size, datal, comp_type); goto out; @@ -213,8 +213,8 @@ static int clone_copy_inline_extent(struct inode *dst, BTRFS_FILE_EXTENT_INLINE) goto copy_inline_extent; - ret = copy_inline_to_page(dst, new_key->offset, inline_data, - size, datal, comp_type); + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); goto out; } @@ -231,8 +231,8 @@ copy_inline_extent: * clone. Deal with all these cases by copying the inline extent * data into the respective page at the destination inode. 
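+	 * (Writing the data through the page cache and marking it delalloc
+	 * lets the regular writeback path decide how to store it at the
+	 * destination, either as an inline extent or as a regular one.)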
*/ - ret = copy_inline_to_page(dst, new_key->offset, inline_data, - size, datal, comp_type); + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); goto out; } @@ -439,7 +439,7 @@ process_slot: if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { - struct btrfs_clone_extent_info clone_info; + struct btrfs_replace_extent_info clone_info; /* * a | --- range to clone ---| b @@ -462,8 +462,8 @@ process_slot: clone_info.data_len = datal; clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; - clone_info.item_size = size; - ret = btrfs_punch_hole_range(inode, path, drop_start, + clone_info.is_new_extent = false; + ret = btrfs_replace_file_extents(inode, path, drop_start, new_key.offset + datal - 1, &clone_info, &trans); if (ret) @@ -520,6 +520,8 @@ process_slot: ret = -EINTR; goto out; } + + cond_resched(); } ret = 0; @@ -533,7 +535,7 @@ process_slot: btrfs_release_path(path); path->leave_spinning = 0; - ret = btrfs_punch_hole_range(inode, path, last_dest_end, + ret = btrfs_replace_file_extents(inode, path, last_dest_end, destoff + len - 1, NULL, &trans); if (ret) goto out; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4ba1ab9cc76d..3602806d71bd 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1206,7 +1206,8 @@ again: } if (cow) { - ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); + ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb, + BTRFS_NESTING_COW); BUG_ON(ret); } btrfs_set_lock_blocking_write(eb); @@ -1274,7 +1275,8 @@ again: btrfs_tree_lock(eb); if (cow) { ret = btrfs_cow_block(trans, dest, eb, parent, - slot, &eb); + slot, &eb, + BTRFS_NESTING_COW); BUG_ON(ret); } btrfs_set_lock_blocking_write(eb); @@ -1781,7 +1783,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, * relocated and the block is tree root. 
*/ leaf = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); + ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf, + BTRFS_NESTING_COW); btrfs_tree_unlock(leaf); free_extent_buffer(leaf); if (ret < 0) @@ -2308,7 +2311,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!node->eb) { ret = btrfs_cow_block(trans, root, eb, upper->eb, - slot, &eb); + slot, &eb, BTRFS_NESTING_COW); btrfs_tree_unlock(eb); free_extent_buffer(eb); if (ret < 0) { diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index c89697486366..702dc5441f03 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -512,11 +512,20 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (ret && qgroup_num_bytes) btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); + if (!ret) { + spin_lock(&rsv->lock); + rsv->qgroup_rsv_reserved += qgroup_num_bytes; + spin_unlock(&rsv->lock); + } return ret; } -void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, +void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { - btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL); + struct btrfs_fs_info *fs_info = root->fs_info; + u64 qgroup_to_release; + + btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release); + btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release); } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 354ab9985a34..cf63f1e27a27 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -835,7 +835,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) int success; bool full_stripe_locked; unsigned int nofs_flag; - static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); BUG_ON(sblock_to_check->page_count < 1); @@ -969,14 +969,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_lock(&sctx->stat_lock); sctx->stat.read_errors++; spin_unlock(&sctx->stat_lock); - if (__ratelimit(&_rs)) + if (__ratelimit(&rs)) scrub_print_warning("i/o error", sblock_to_check); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); } else if (sblock_bad->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.csum_errors++; spin_unlock(&sctx->stat_lock); - if (__ratelimit(&_rs)) + if (__ratelimit(&rs)) scrub_print_warning("checksum error", sblock_to_check); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); @@ -984,7 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_lock(&sctx->stat_lock); sctx->stat.verify_errors++; spin_unlock(&sctx->stat_lock); - if (__ratelimit(&_rs)) + if (__ratelimit(&rs)) scrub_print_warning("checksum/header error", sblock_to_check); if (sblock_bad->generation_error) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d9813a5b075a..340c76a12ce1 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -122,8 +122,6 @@ struct send_ctx { struct file_ra_state ra; - char *read_buf; - /* * We process inodes by their increasing order, so if before an * incremental send we reverse the parent/child relationship of @@ -278,11 +276,6 @@ enum btrfs_compare_tree_result { BTRFS_COMPARE_TREE_CHANGED, BTRFS_COMPARE_TREE_SAME, }; -typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path, - struct btrfs_path *right_path, - struct btrfs_key *key, - enum btrfs_compare_tree_result result, - void *ctx); __cold static void inconsistent_snapshot_error(struct send_ctx *sctx, @@ 
-584,8 +577,8 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return -EOVERFLOW; hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size); - hdr->tlv_type = cpu_to_le16(attr); - hdr->tlv_len = cpu_to_le16(len); + put_unaligned_le16(attr, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); memcpy(hdr + 1, data, len); sctx->send_size += total_len; @@ -695,7 +688,7 @@ static int begin_cmd(struct send_ctx *sctx, int cmd) sctx->send_size += sizeof(*hdr); hdr = (struct btrfs_cmd_header *)sctx->send_buf; - hdr->cmd = cpu_to_le16(cmd); + put_unaligned_le16(cmd, &hdr->cmd); return 0; } @@ -707,17 +700,17 @@ static int send_cmd(struct send_ctx *sctx) u32 crc; hdr = (struct btrfs_cmd_header *)sctx->send_buf; - hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); - hdr->crc = 0; + put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len); + put_unaligned_le32(0, &hdr->crc); crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); - hdr->crc = cpu_to_le32(crc); + put_unaligned_le32(crc, &hdr->crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, &sctx->send_off); sctx->total_send_size += sctx->send_size; - sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size; + sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size; sctx->send_size = 0; return ret; @@ -3813,6 +3806,72 @@ static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) } /* + * When processing the new references for an inode we may orphanize an existing + * directory inode because its old name conflicts with one of the new references + * of the current inode. Later, when processing another new reference of our + * inode, we might need to orphanize another inode, but the path we have in the + * reference reflects the pre-orphanization name of the directory we previously + * orphanized. For example: + * + * parent snapshot looks like: + * + * . (ino 256) + * |----- f1 (ino 257) + * |----- f2 (ino 258) + * |----- d1/ (ino 259) + * |----- d2/ (ino 260) + * + * send snapshot looks like: + * + * . (ino 256) + * |----- d1 (ino 258) + * |----- f2/ (ino 259) + * |----- f2_link/ (ino 260) + * | |----- f1 (ino 257) + * | + * |----- d2 (ino 258) + * + * When processing inode 257 we compute the name for inode 259 as "d1", and we + * cache it in the name cache. Later when we start processing inode 258, when + * collecting all its new references we set a full path of "d1/d2" for its new + * reference with name "d2". When we start processing the new references we + * start by processing the new reference with name "d1", and this results in + * orphanizing inode 259, since its old reference causes a conflict. Then we + * move on the next new reference, with name "d2", and we find out we must + * orphanize inode 260, as its old reference conflicts with ours - but for the + * orphanization we use a source path corresponding to the path we stored in the + * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the + * receiver fail since the path component "d1/" no longer exists, it was renamed + * to "o259-6-0/" when processing the previous new reference. So in this case we + * must recompute the path in the new reference and use it for the new + * orphanization operation. 
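+ *
+ * In short: if any directory was orphanized while processing the current
+ * inode's new references, a previously computed full path may be stale and
+ * must be recomputed before it is used as the source of another
+ * orphanization.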
+ */ +static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) +{ + char *name; + int ret; + + name = kmemdup(ref->name, ref->name_len, GFP_KERNEL); + if (!name) + return -ENOMEM; + + fs_path_reset(ref->full_path); + ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path); + if (ret < 0) + goto out; + + ret = fs_path_add(ref->full_path, name, ref->name_len); + if (ret < 0) + goto out; + + /* Update the reference's base name pointer. */ + set_ref_path(ref, ref->full_path); +out: + kfree(name); + return ret; +} + +/* * This does all the move/link/unlink/rmdir magic. */ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) @@ -3880,52 +3939,56 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) goto out; } + /* + * Before doing any rename and link operations, do a first pass on the + * new references to orphanize any unprocessed inodes that may have a + * reference that conflicts with one of the new references of the current + * inode. This needs to happen first because a new reference may conflict + * with the old reference of a parent directory, so we must make sure + * that the path used for link and rename commands don't use an + * orphanized name when an ancestor was not yet orphanized. + * + * Example: + * + * Parent snapshot: + * + * . (ino 256) + * |----- testdir/ (ino 259) + * | |----- a (ino 257) + * | + * |----- b (ino 258) + * + * Send snapshot: + * + * . (ino 256) + * |----- testdir_2/ (ino 259) + * | |----- a (ino 260) + * | + * |----- testdir (ino 257) + * |----- b (ino 257) + * |----- b2 (ino 258) + * + * Processing the new reference for inode 257 with name "b" may happen + * before processing the new reference with name "testdir". If so, we + * must make sure that by the time we send a link command to create the + * hard link "b", inode 259 was already orphanized, since the generated + * path in "valid_path" already contains the orphanized name for 259. + * We are processing inode 257, so only later when processing 259 we do + * the rename operation to change its temporary (orphanized) name to + * "testdir_2". + */ list_for_each_entry(cur, &sctx->new_refs, list) { - /* - * We may have refs where the parent directory does not exist - * yet. This happens if the parent directories inum is higher - * than the current inum. To handle this case, we create the - * parent directory out of order. But we need to check if this - * did already happen before due to other refs in the same dir. - */ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; - if (ret == inode_state_will_create) { - ret = 0; - /* - * First check if any of the current inodes refs did - * already create the dir. - */ - list_for_each_entry(cur2, &sctx->new_refs, list) { - if (cur == cur2) - break; - if (cur2->dir == cur->dir) { - ret = 1; - break; - } - } - - /* - * If that did not happen, check if a previous inode - * did already create the dir. - */ - if (!ret) - ret = did_create_dir(sctx, cur->dir); - if (ret < 0) - goto out; - if (!ret) { - ret = send_create_inode(sctx, cur->dir); - if (ret < 0) - goto out; - } - } + if (ret == inode_state_will_create) + continue; /* - * Check if this new ref would overwrite the first ref of - * another unprocessed inode. If yes, orphanize the - * overwritten inode. If we find an overwritten ref that is - * not the first ref, simply unlink it. + * Check if this new ref would overwrite the first ref of another + * unprocessed inode. If yes, orphanize the overwritten inode. 
+ * If we find an overwritten ref that is not the first ref, + * simply unlink it. */ ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen, cur->name, cur->name_len, @@ -3942,6 +4005,12 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) struct name_cache_entry *nce; struct waiting_dir_move *wdm; + if (orphanized_dir) { + ret = refresh_ref_path(sctx, cur); + if (ret < 0) + goto out; + } + ret = orphanize_inode(sctx, ow_inode, ow_gen, cur->full_path); if (ret < 0) @@ -4004,6 +4073,49 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) } } + } + + list_for_each_entry(cur, &sctx->new_refs, list) { + /* + * We may have refs where the parent directory does not exist + * yet. This happens if the parent directories inum is higher + * than the current inum. To handle this case, we create the + * parent directory out of order. But we need to check if this + * did already happen before due to other refs in the same dir. + */ + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + if (ret == inode_state_will_create) { + ret = 0; + /* + * First check if any of the current inodes refs did + * already create the dir. + */ + list_for_each_entry(cur2, &sctx->new_refs, list) { + if (cur == cur2) + break; + if (cur2->dir == cur->dir) { + ret = 1; + break; + } + } + + /* + * If that did not happen, check if a previous inode + * did already create the dir. + */ + if (!ret) + ret = did_create_dir(sctx, cur->dir); + if (ret < 0) + goto out; + if (!ret) { + ret = send_create_inode(sctx, cur->dir); + if (ret < 0) + goto out; + } + } + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) { ret = wait_for_dest_dir_move(sctx, cur, is_orphan); if (ret < 0) @@ -4799,7 +4911,25 @@ out: return ret; } -static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) +static inline u64 max_send_read_size(const struct send_ctx *sctx) +{ + return sctx->send_max_size - SZ_16K; +} + +static int put_data_header(struct send_ctx *sctx, u32 len) +{ + struct btrfs_tlv_header *hdr; + + if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) + return -EOVERFLOW; + hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); + put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); + sctx->send_size += sizeof(*hdr); + return 0; +} + +static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -4809,21 +4939,16 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; unsigned pg_offset = offset_in_page(offset); - ssize_t ret = 0; + int ret; + + ret = put_data_header(sctx, len); + if (ret) + return ret; inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); - if (offset + len > i_size_read(inode)) { - if (offset > i_size_read(inode)) - len = 0; - else - len = offset - i_size_read(inode); - } - if (len == 0) - goto out; - last_index = (offset + len - 1) >> PAGE_SHIFT; /* initial readahead */ @@ -4864,16 +4989,16 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) } addr = kmap(page); - memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len); + memcpy(sctx->send_buf + sctx->send_size, addr + pg_offset, + cur_len); kunmap(page); unlock_page(page); put_page(page); index++; pg_offset = 0; len -= cur_len; - ret += cur_len; + sctx->send_size += 
cur_len; } -out: iput(inode); return ret; } @@ -4887,7 +5012,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - ssize_t num_read = 0; p = fs_path_alloc(); if (!p) @@ -4895,13 +5019,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); - num_read = fill_read_buf(sctx, offset, len); - if (num_read <= 0) { - if (num_read < 0) - ret = num_read; - goto out; - } - ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) goto out; @@ -4912,16 +5029,16 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); - TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read); + ret = put_file_data(sctx, offset, len); + if (ret < 0) + goto out; ret = send_cmd(sctx); tlv_put_failure: out: fs_path_free(p); - if (ret < 0) - return ret; - return num_read; + return ret; } /* @@ -5033,8 +5150,8 @@ out: static int send_hole(struct send_ctx *sctx, u64 end) { struct fs_path *p = NULL; + u64 read_size = max_send_read_size(sctx); u64 offset = sctx->cur_inode_last_extent; - u64 len; int ret = 0; /* @@ -5061,16 +5178,19 @@ static int send_hole(struct send_ctx *sctx, u64 end) ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); if (ret < 0) goto tlv_put_failure; - memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); while (offset < end) { - len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); + u64 len = min(end - offset, read_size); ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) break; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); - TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); + ret = put_data_header(sctx, len); + if (ret < 0) + break; + memset(sctx->send_buf + sctx->send_size, 0, len); + sctx->send_size += len; ret = send_cmd(sctx); if (ret < 0) break; @@ -5086,23 +5206,20 @@ static int send_extent_data(struct send_ctx *sctx, const u64 offset, const u64 len) { + u64 read_size = max_send_read_size(sctx); u64 sent = 0; if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, len); while (sent < len) { - u64 size = len - sent; + u64 size = min(len - sent, read_size); int ret; - if (size > BTRFS_SEND_READ_SIZE) - size = BTRFS_SEND_READ_SIZE; ret = send_write(sctx, offset + sent, size); if (ret < 0) return ret; - if (!ret) - break; - sent += ret; + sent += size; } return 0; } @@ -5402,51 +5519,29 @@ static int send_write_or_clone(struct send_ctx *sctx, struct clone_root *clone_root) { int ret = 0; - struct btrfs_file_extent_item *ei; u64 offset = key->offset; - u64 len; - u8 type; + u64 end; u64 bs = sctx->send_root->fs_info->sb->s_blocksize; - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - type = btrfs_file_extent_type(path->nodes[0], ei); - if (type == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); - /* - * it is possible the inline item won't cover the whole page, - * but there may be items after this page. 
Make - * sure to send the whole thing - */ - len = PAGE_ALIGN(len); - } else { - len = btrfs_file_extent_num_bytes(path->nodes[0], ei); - } - - if (offset >= sctx->cur_inode_size) { - ret = 0; - goto out; - } - if (offset + len > sctx->cur_inode_size) - len = sctx->cur_inode_size - offset; - if (len == 0) { - ret = 0; - goto out; - } + end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); + if (offset >= end) + return 0; - if (clone_root && IS_ALIGNED(offset + len, bs)) { + if (clone_root && IS_ALIGNED(end, bs)) { + struct btrfs_file_extent_item *ei; u64 disk_byte; u64 data_offset; + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); data_offset = btrfs_file_extent_offset(path->nodes[0], ei); ret = clone_range(sctx, clone_root, disk_byte, data_offset, - offset, len); + offset, end - offset); } else { - ret = send_extent_data(sctx, offset, len); + ret = send_extent_data(sctx, offset, end - offset); } - sctx->cur_inode_next_write_offset = offset + len; -out: + sctx->cur_inode_next_write_offset = end; return ret; } @@ -6692,8 +6787,7 @@ static int tree_compare_item(struct btrfs_path *left_path, * If it detects a change, it aborts immediately. */ static int btrfs_compare_trees(struct btrfs_root *left_root, - struct btrfs_root *right_root, - btrfs_changed_cb_t changed_cb, void *ctx) + struct btrfs_root *right_root, void *ctx) { struct btrfs_fs_info *fs_info = left_root->fs_info; int ret; @@ -6960,8 +7054,7 @@ static int send_subvol(struct send_ctx *sctx) goto out; if (sctx->parent_root) { - ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, - changed_cb, sctx); + ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx); if (ret < 0) goto out; ret = finish_inode_if_needed(sctx, 1); @@ -7087,7 +7180,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) u32 i; u64 *clone_sources_tmp = NULL; int clone_sources_to_rollback = 0; - unsigned alloc_size; + size_t alloc_size; int sort_clone_roots = 0; if (!capable(CAP_SYS_ADMIN)) @@ -7169,25 +7262,20 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL); - if (!sctx->read_buf) { - ret = -ENOMEM; - goto out; - } - sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; - alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); - - sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL); + sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), + arg->clone_sources_count + 1, + GFP_KERNEL); if (!sctx->clone_roots) { ret = -ENOMEM; goto out; } - alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); + alloc_size = array_size(sizeof(*arg->clone_sources), + arg->clone_sources_count); if (arg->clone_sources_count) { clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL); @@ -7378,7 +7466,6 @@ out: kvfree(sctx->clone_roots); kvfree(sctx->send_buf); - kvfree(sctx->read_buf); name_cache_free(sctx); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index ead397f7034f..de91488b7cd0 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -13,7 +13,6 @@ #define BTRFS_SEND_STREAM_VERSION 1 #define BTRFS_SEND_BUF_SIZE SZ_64K -#define BTRFS_SEND_READ_SIZE (48 * SZ_1K) enum btrfs_tlv_type { BTRFS_TLV_U8, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 475968ccbd1d..64099565ab8f 100644 --- a/fs/btrfs/space-info.c +++ 
b/fs/btrfs/space-info.c @@ -175,10 +175,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) struct list_head *head = &info->space_info; struct btrfs_space_info *found; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) + list_for_each_entry(found, head, list) found->full = 0; - rcu_read_unlock(); } static int create_space_info(struct btrfs_fs_info *info, u64 flags) @@ -213,7 +211,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) if (ret) return ret; - list_add_rcu(&space_info->list, &info->space_info); + list_add(&space_info->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) info->data_sinfo = space_info; @@ -290,22 +288,13 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & flags) { - rcu_read_unlock(); + list_for_each_entry(found, head, list) { + if (found->flags & flags) return found; - } } - rcu_read_unlock(); return NULL; } -static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) -{ - return (global->size << 1); -} - static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) @@ -476,28 +465,6 @@ again: up_read(&info->groups_sem); } -static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, - unsigned long nr_pages, int nr_items) -{ - struct super_block *sb = fs_info->sb; - - if (down_read_trylock(&sb->s_umount)) { - writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); - up_read(&sb->s_umount); - } else { - /* - * We needn't worry the filesystem going from r/w to r/o though - * we don't acquire ->s_umount mutex, because the filesystem - * should guarantee the delalloc inodes list be empty after - * the filesystem is readonly(all dirty pages are written to - * the disk). - */ - btrfs_start_delalloc_roots(fs_info, nr_items); - if (!current->journal_info) - btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); - } -} - static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, u64 to_reclaim) { @@ -516,25 +483,33 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, /* * shrink metadata reservation for delalloc */ -static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, - u64 orig, bool wait_ordered) +static void shrink_delalloc(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 to_reclaim, bool wait_ordered) { - struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; u64 dio_bytes; - u64 async_pages; u64 items; long time_left; - unsigned long nr_pages; int loops; /* Calc the number of the pages we need flush for space reservation */ - items = calc_reclaim_items_nr(fs_info, to_reclaim); - to_reclaim = items * EXTENT_SIZE_PER_ITEM; + if (to_reclaim == U64_MAX) { + items = U64_MAX; + } else { + /* + * to_reclaim is set to however much metadata we need to + * reclaim, but reclaiming that much data doesn't really track + * exactly, so increase the amount to reclaim by 2x in order to + * make sure we're flushing enough delalloc to hopefully reclaim + * some metadata reservations. 
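+		 * (E.g. with the default 16K nodesize, one item's worth of
+		 * metadata reservation is 256K, so a request to reclaim 1M
+		 * becomes 4 items, doubled to 8, and up to 8 * 256K = 2M of
+		 * delalloc is flushed.)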
+ */ + items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; + to_reclaim = items * EXTENT_SIZE_PER_ITEM; + } trans = (struct btrfs_trans_handle *)current->journal_info; - space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); @@ -557,37 +532,17 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, loops = 0; while ((delalloc_bytes || dio_bytes) && loops < 3) { - nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; - - /* - * Triggers inode writeback for up to nr_pages. This will invoke - * ->writepages callback and trigger delalloc filling - * (btrfs_run_delalloc_range()). - */ - btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); + btrfs_start_delalloc_roots(fs_info, items); - /* - * We need to wait for the compressed pages to start before - * we continue. - */ - async_pages = atomic_read(&fs_info->async_delalloc_pages); - if (!async_pages) - goto skip_async; - - /* - * Calculate how many compressed pages we want to be written - * before we continue. I.e if there are more async pages than we - * require wait_event will wait until nr_pages are written. - */ - if (async_pages <= nr_pages) - async_pages = 0; - else - async_pages -= nr_pages; + loops++; + if (wait_ordered && !trans) { + btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + } else { + time_left = schedule_timeout_killable(1); + if (time_left) + break; + } - wait_event(fs_info->async_submit_wait, - atomic_read(&fs_info->async_delalloc_pages) <= - (int)async_pages); -skip_async: spin_lock(&space_info->lock); if (list_empty(&space_info->tickets) && list_empty(&space_info->priority_tickets)) { @@ -596,14 +551,6 @@ skip_async: } spin_unlock(&space_info->lock); - loops++; - if (wait_ordered && !trans) { - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); - } else { - time_left = schedule_timeout_killable(1); - if (time_left) - break; - } delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); @@ -628,8 +575,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv; struct btrfs_trans_handle *trans; - u64 bytes_needed; u64 reclaim_bytes = 0; + u64 bytes_needed = 0; u64 cur_free_bytes = 0; trans = (struct btrfs_trans_handle *)current->journal_info; @@ -649,7 +596,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, else if (!list_empty(&space_info->tickets)) ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - bytes_needed = (ticket) ? ticket->bytes : 0; + if (ticket) + bytes_needed = ticket->bytes; if (bytes_needed > cur_free_bytes) bytes_needed -= cur_free_bytes; @@ -676,8 +624,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, goto commit; /* - * See if there is some space in the delayed insertion reservation for - * this reservation. + * See if there is some space in the delayed insertion reserve for this + * reservation. If the space_info's don't match (like for DATA or + * SYSTEM) then just go enospc, reclaiming this space won't recover any + * space to satisfy those reservations. 
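+	 * (The delayed inode and delayed refs reserves only ever hold
+	 * METADATA space, so a DATA or SYSTEM ticket cannot be helped by
+	 * flushing them.)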
*/ if (space_info != delayed_rsv->space_info) goto enospc; @@ -742,7 +692,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(fs_info, num_bytes * 2, num_bytes, + shrink_delalloc(fs_info, space_info, num_bytes, state == FLUSH_DELALLOC_WAIT); break; case FLUSH_DELAYED_REFS_NR: @@ -767,7 +717,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; } ret = btrfs_chunk_alloc(trans, - btrfs_metadata_alloc_profile(fs_info), + btrfs_get_alloc_profile(fs_info, space_info->flags), (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); @@ -1037,9 +987,132 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } while (flush_state <= COMMIT_TRANS); } -void btrfs_init_async_reclaim_work(struct work_struct *work) +/* + * FLUSH_DELALLOC_WAIT: + * Space is freed from flushing delalloc in one of two ways. + * + * 1) compression is on and we allocate less space than we reserved + * 2) we are overwriting existing space + * + * For #1 that extra space is reclaimed as soon as the delalloc pages are + * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent + * length to ->bytes_reserved, and subtracts the reserved space from + * ->bytes_may_use. + * + * For #2 this is trickier. Once the ordered extent runs we will drop the + * extent in the range we are overwriting, which creates a delayed ref for + * that freed extent. This however is not reclaimed until the transaction + * commits, thus the next stages. + * + * RUN_DELAYED_IPUTS + * If we are freeing inodes, we want to make sure all delayed iputs have + * completed, because they could have been on an inode with i_nlink == 0, and + * thus have been truncated and freed up space. But again this space is not + * immediately re-usable, it comes in the form of a delayed ref, which must be + * run and then the transaction must be committed. + * + * FLUSH_DELAYED_REFS + * The above two cases generate delayed refs that will affect + * ->total_bytes_pinned. However this counter can be inconsistent with + * reality if there are outstanding delayed refs. This is because we adjust + * the counter based solely on the current set of delayed refs and disregard + * any on-disk state which might include more refs. So for example, if we + * have an extent with 2 references, but we only drop 1, we'll see that there + * is a negative delayed ref count for the extent and assume that the space + * will be freed, and thus increase ->total_bytes_pinned. + * + * Running the delayed refs gives us the actual real view of what will be + * freed at the transaction commit time. This stage will not actually free + * space for us, it just makes sure that may_commit_transaction() has all of + * the information it needs to make the right decision. + * + * COMMIT_TRANS + * This is where we reclaim all of the pinned space generated by the previous + * two stages. We will not commit the transaction if we don't think we're + * likely to satisfy our request, which means if our current free space + + * total_bytes_pinned < reservation we will not commit. This is why the + * previous states are actually important, to make sure we know for sure + * whether committing the transaction will allow us to make progress. 
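+ *
+ * (E.g. a ticket asking for 1M when only 256K is free leads to a commit
+ * only if at least 768K is currently pinned.)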
+ * + * ALLOC_CHUNK_FORCE + * For data we start with alloc chunk force, however we could have been full + * before, and then the transaction commit could have freed new block groups, + * so if we now have space to allocate do the force chunk allocation. + */ +static const enum btrfs_flush_state data_flush_states[] = { + FLUSH_DELALLOC_WAIT, + RUN_DELAYED_IPUTS, + FLUSH_DELAYED_REFS, + COMMIT_TRANS, + ALLOC_CHUNK_FORCE, +}; + +static void btrfs_async_reclaim_data_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 last_tickets_id; + int flush_state = 0; + + fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); + space_info = fs_info->data_sinfo; + + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + + while (!space_info->full) { + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + } + + while (flush_state < ARRAY_SIZE(data_flush_states)) { + flush_space(fs_info, space_info, U64_MAX, + data_flush_states[flush_state]); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + + if (last_tickets_id == space_info->tickets_id) { + flush_state++; + } else { + last_tickets_id = space_info->tickets_id; + flush_state = 0; + } + + if (flush_state >= ARRAY_SIZE(data_flush_states)) { + if (space_info->full) { + if (maybe_fail_all_tickets(fs_info, space_info)) + flush_state = 0; + else + space_info->flush = 0; + } else { + flush_state = 0; + } + } + spin_unlock(&space_info->lock); + } +} + +void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) { - INIT_WORK(work, btrfs_async_reclaim_metadata_space); + INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space); + INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space); } static const enum btrfs_flush_state priority_flush_states[] = { @@ -1089,6 +1162,21 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, } while (flush_state < states_nr); } +static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) +{ + while (!space_info->full) { + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + } +} + static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket) @@ -1141,6 +1229,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, int ret; switch (flush) { + case BTRFS_RESERVE_FLUSH_DATA: case BTRFS_RESERVE_FLUSH_ALL: case BTRFS_RESERVE_FLUSH_ALL_STEAL: wait_reserve_ticket(fs_info, space_info, ticket); @@ -1155,6 +1244,9 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, evict_flush_states, ARRAY_SIZE(evict_flush_states)); break; + case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE: + priority_reclaim_data_space(fs_info, space_info, ticket); + break; default: ASSERT(0); break; @@ -1214,11 
+1306,11 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) * regain reservations will be made and this will fail if there is not enough * space already. */ -static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) +static int __reserve_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { + struct work_struct *async_work; struct reserve_ticket ticket; u64 used; int ret = 0; @@ -1227,6 +1319,11 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, ASSERT(orig_bytes); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); + if (flush == BTRFS_RESERVE_FLUSH_DATA) + async_work = &fs_info->async_data_reclaim_work; + else + async_work = &fs_info->async_reclaim_work; + spin_lock(&space_info->lock); ret = -ENOSPC; used = btrfs_space_info_used(space_info, true); @@ -1268,7 +1365,8 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, init_waitqueue_head(&ticket.wait); ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); if (flush == BTRFS_RESERVE_FLUSH_ALL || - flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { + flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || + flush == BTRFS_RESERVE_FLUSH_DATA) { list_add_tail(&ticket.list, &space_info->tickets); if (!space_info->flush) { space_info->flush = 1; @@ -1276,8 +1374,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, space_info->flags, orig_bytes, flush, "enospc"); - queue_work(system_unbound_wq, - &fs_info->async_reclaim_work); + queue_work(system_unbound_wq, async_work); } } else { list_add_tail(&ticket.list, @@ -1329,8 +1426,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; - ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, - orig_bytes, flush); + ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { if (block_rsv != global_rsv && @@ -1348,3 +1444,32 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, } return ret; } + +/** + * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation + * @fs_info - the filesystem + * @bytes - the number of bytes we need + * @flush - how we are allowed to flush + * + * This will reserve bytes from the data space info. If there is not enough + * space then we will attempt to flush space as specified by flush. 
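+ *
+ * Returns 0 once the space has been reserved, or a negative errno
+ * (typically -ENOSPC) when it could not be.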
+ */ +int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + int ret; + + ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || + flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE); + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); + + ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); + if (ret == -ENOSPC) { + trace_btrfs_space_reservation(fs_info, "space_info:enospc", + data_sinfo->flags, bytes, 1); + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0); + } + return ret; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index c3c64019950a..5646393b928c 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -149,5 +149,7 @@ static inline void btrfs_space_info_free_bytes_may_use( btrfs_try_granting_tickets(fs_info, space_info); spin_unlock(&space_info->lock); } +int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, + enum btrfs_reserve_flush_enum flush); #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index 079b059818e9..c46be27be700 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -7,16 +7,6 @@ #include "ctree.h" -static inline u8 get_unaligned_le8(const void *p) -{ - return *(u8 *)p; -} - -static inline void put_unaligned_le8(u8 val, void *p) -{ - *(u8 *)p = val; -} - static bool check_setget_bounds(const struct extent_buffer *eb, const void *ptr, unsigned off, int size) { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 25967ecaaf0a..8840a4fa81eb 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1871,6 +1871,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) * the filesystem is busy. 
*/ cancel_work_sync(&fs_info->async_reclaim_work); + cancel_work_sync(&fs_info->async_data_reclaim_work); btrfs_discard_cleanup(fs_info); @@ -2163,8 +2164,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 thresh = 0; int mixed = 0; - rcu_read_lock(); - list_for_each_entry_rcu(found, &fs_info->space_info, list) { + list_for_each_entry(found, &fs_info->space_info, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { int i; @@ -2193,8 +2193,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) total_used += found->disk_used; } - rcu_read_unlock(); - buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor); buf->f_blocks >>= bits; buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c8df2edafd85..279d9262b676 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -14,6 +14,7 @@ #include "ctree.h" #include "discard.h" #include "disk-io.h" +#include "send.h" #include "transaction.h" #include "sysfs.h" #include "volumes.h" @@ -321,9 +322,17 @@ static ssize_t supported_checksums_show(struct kobject *kobj, } BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); +static ssize_t send_stream_version_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION); +} +BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show); + static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), + BTRFS_ATTR_PTR(static_feature, send_stream_version), NULL }; @@ -809,6 +818,42 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj, BTRFS_ATTR(, checksum, btrfs_checksum_show); +static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + const char *str; + + switch (READ_ONCE(fs_info->exclusive_operation)) { + case BTRFS_EXCLOP_NONE: + str = "none\n"; + break; + case BTRFS_EXCLOP_BALANCE: + str = "balance\n"; + break; + case BTRFS_EXCLOP_DEV_ADD: + str = "device add\n"; + break; + case BTRFS_EXCLOP_DEV_REMOVE: + str = "device remove\n"; + break; + case BTRFS_EXCLOP_DEV_REPLACE: + str = "device replace\n"; + break; + case BTRFS_EXCLOP_RESIZE: + str = "resize\n"; + break; + case BTRFS_EXCLOP_SWAP_ACTIVATE: + str = "swap activate\n"; + break; + default: + str = "UNKNOWN\n"; + break; + } + return scnprintf(buf, PAGE_SIZE, "%s", str); +} +BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), @@ -817,6 +862,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, quota_override), BTRFS_ATTR_PTR(, metadata_uuid), BTRFS_ATTR_PTR(, checksum), + BTRFS_ATTR_PTR(, exclusive_operation), NULL, }; @@ -935,12 +981,24 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) } } +static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *seed; + + list_for_each_entry(device, &fs_devices->devices, dev_list) + btrfs_sysfs_remove_device(device); + + list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed->devices, dev_list) + btrfs_sysfs_remove_device(device); + } +} + void btrfs_sysfs_remove_mounted(struct btrfs_fs_info 
*fs_info) { struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; - btrfs_reset_fs_info_ptr(fs_info); - sysfs_remove_link(fsid_kobj, "bdi"); if (fs_info->space_info_kobj) { @@ -964,7 +1022,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) addrm_unknown_feature_attrs(fs_info, false); sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); sysfs_remove_files(fsid_kobj, btrfs_attrs); - btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL); + btrfs_sysfs_remove_fs_devices(fs_info->fs_devices); } static const char * const btrfs_feature_set_names[FEAT_MAX] = { @@ -973,7 +1031,7 @@ static const char * const btrfs_feature_set_names[FEAT_MAX] = { [FEAT_INCOMPAT] = "incompat", }; -const char * const btrfs_feature_set_name(enum btrfs_feature_set set) +const char *btrfs_feature_set_name(enum btrfs_feature_set set) { return btrfs_feature_set_names[set]; } @@ -1079,17 +1137,38 @@ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache) rkobj->flags = cache->flags; kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + + /* + * We call this either on mount, or if we've created a block group for a + * new index type while running (i.e. when restriping). The running + * case is tricky because we could race with other threads, so we need + * to have this check to make sure we didn't already init the kobject. + * + * We don't have to protect on the free side because it only happens on + * unmount. + */ + spin_lock(&space_info->lock); + if (space_info->block_group_kobjs[index]) { + spin_unlock(&space_info->lock); + kobject_put(&rkobj->kobj); + return; + } else { + space_info->block_group_kobjs[index] = &rkobj->kobj; + } + spin_unlock(&space_info->lock); + ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s", btrfs_bg_type_to_raid_name(rkobj->flags)); memalloc_nofs_restore(nofs_flag); if (ret) { + spin_lock(&space_info->lock); + space_info->block_group_kobjs[index] = NULL; + spin_unlock(&space_info->lock); kobject_put(&rkobj->kobj); btrfs_warn(fs_info, "failed to add kobject for block cache, ignoring"); return; } - - space_info->block_group_kobjs[index] = &rkobj->kobj; } /* @@ -1151,48 +1230,30 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, return 0; } -/* when one_device is NULL, it removes all device links */ - -int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device) +void btrfs_sysfs_remove_device(struct btrfs_device *device) { struct hd_struct *disk; struct kobject *disk_kobj; + struct kobject *devices_kobj; - if (!fs_devices->devices_kobj) - return -EINVAL; - - if (one_device) { - if (one_device->bdev) { - disk = one_device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_devices->devices_kobj, - disk_kobj->name); - } - - kobject_del(&one_device->devid_kobj); - kobject_put(&one_device->devid_kobj); - - wait_for_completion(&one_device->kobj_unregister); + /* + * Seed fs_devices devices_kobj aren't used, fetch kobject from the + * fs_info::fs_devices. 
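+	 * All device sysfs entries, including those for devices of a seed
+	 * filesystem, live under the mounted filesystem's kobjects.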
+ */ + devices_kobj = device->fs_info->fs_devices->devices_kobj; + ASSERT(devices_kobj); - return 0; + if (device->bdev) { + disk = device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + sysfs_remove_link(devices_kobj, disk_kobj->name); } - list_for_each_entry(one_device, &fs_devices->devices, dev_list) { - - if (one_device->bdev) { - disk = one_device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_devices->devices_kobj, - disk_kobj->name); - } - kobject_del(&one_device->devid_kobj); - kobject_put(&one_device->devid_kobj); - - wait_for_completion(&one_device->kobj_unregister); + if (device->devid_kobj.state_initialized) { + kobject_del(&device->devid_kobj); + kobject_put(&device->devid_kobj); + wait_for_completion(&device->kobj_unregister); } - - return 0; } static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, @@ -1273,44 +1334,80 @@ static struct kobj_type devid_ktype = { .release = btrfs_release_devid_kobj, }; -int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device) +int btrfs_sysfs_add_device(struct btrfs_device *device) { - int error = 0; - struct btrfs_device *dev; + int ret; unsigned int nofs_flag; + struct kobject *devices_kobj; + struct kobject *devinfo_kobj; - nofs_flag = memalloc_nofs_save(); - list_for_each_entry(dev, &fs_devices->devices, dev_list) { + /* + * Make sure we use the fs_info::fs_devices to fetch the kobjects even + * for the seed fs_devices + */ + devices_kobj = device->fs_info->fs_devices->devices_kobj; + devinfo_kobj = device->fs_info->fs_devices->devinfo_kobj; + ASSERT(devices_kobj); + ASSERT(devinfo_kobj); - if (one_device && one_device != dev) - continue; + nofs_flag = memalloc_nofs_save(); - if (dev->bdev) { - struct hd_struct *disk; - struct kobject *disk_kobj; + if (device->bdev) { + struct hd_struct *disk; + struct kobject *disk_kobj; - disk = dev->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; + disk = device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; - error = sysfs_create_link(fs_devices->devices_kobj, - disk_kobj, disk_kobj->name); - if (error) - break; + ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name); + if (ret) { + btrfs_warn(device->fs_info, + "creating sysfs device link for devid %llu failed: %d", + device->devid, ret); + goto out; } + } - init_completion(&dev->kobj_unregister); - error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype, - fs_devices->devinfo_kobj, "%llu", - dev->devid); - if (error) { - kobject_put(&dev->devid_kobj); - break; - } + init_completion(&device->kobj_unregister); + ret = kobject_init_and_add(&device->devid_kobj, &devid_ktype, + devinfo_kobj, "%llu", device->devid); + if (ret) { + kobject_put(&device->devid_kobj); + btrfs_warn(device->fs_info, + "devinfo init for devid %llu failed: %d", + device->devid, ret); } + +out: memalloc_nofs_restore(nofs_flag); + return ret; +} - return error; +static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + int ret; + struct btrfs_device *device; + struct btrfs_fs_devices *seed; + + list_for_each_entry(device, &fs_devices->devices, dev_list) { + ret = btrfs_sysfs_add_device(device); + if (ret) + goto fail; + } + + list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed->devices, dev_list) { + ret = btrfs_sysfs_add_device(device); + if (ret) + goto fail; + } + } + + return 0; + +fail: + btrfs_sysfs_remove_fs_devices(fs_devices); + return ret; } void 
btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) @@ -1324,8 +1421,8 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) &disk_to_dev(bdev->bd_disk)->kobj); } -void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, - const u8 *fsid) +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices) + { char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; @@ -1333,7 +1430,7 @@ void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, * Sprouting changes fsid of the mounted filesystem, rename the fsid * directory */ - snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fsid); + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_devices->fsid); if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) btrfs_warn(fs_devices->fs_info, "sysfs: failed to create fsid for sprout"); @@ -1400,15 +1497,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; struct kobject *fsid_kobj = &fs_devs->fsid_kobj; - btrfs_set_fs_info_ptr(fs_info); - - error = btrfs_sysfs_add_devices_dir(fs_devs, NULL); + error = btrfs_sysfs_add_fs_devices(fs_devs); if (error) return error; error = sysfs_create_files(fsid_kobj, btrfs_attrs); if (error) { - btrfs_sysfs_remove_devices_dir(fs_devs, NULL); + btrfs_sysfs_remove_fs_devices(fs_devs); return error; } @@ -1626,12 +1721,16 @@ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devs; struct kobject *fsid_kobj; - u64 features; - int ret; + u64 __maybe_unused features; + int __maybe_unused ret; if (!fs_info) return; + /* + * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not + * safe when called from some contexts (eg. balance) + */ features = get_features(fs_info, set); ASSERT(bit & supported_feature_masks[set]); diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index cf839c46a131..bacef43f7267 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -13,15 +13,12 @@ enum btrfs_feature_set { }; char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); -const char * const btrfs_feature_set_name(enum btrfs_feature_set set); -int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device); -int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device); +const char *btrfs_feature_set_name(enum btrfs_feature_set set); +int btrfs_sysfs_add_device(struct btrfs_device *device); +void btrfs_sysfs_remove_device(struct btrfs_device *device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); -void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, - const u8 *fsid); +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, u64 bit, enum btrfs_feature_set set); void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action); diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index a1b9f9b5978e..df54cdfdc250 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -60,8 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = 0; - setup_items_for_insert(root, path, &key, &value_len, value_len, - value_len + sizeof(struct btrfs_item), 1); + 
setup_items_for_insert(root, path, &key, &value_len, 1); item = btrfs_item_nr(0); write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), value_len); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 894a63a92236..e6719f7db386 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -33,8 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; - setup_items_for_insert(root, &path, &key, &value_len, value_len, - value_len + sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, &path, &key, &value_len, 1); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, 1); btrfs_set_file_extent_type(leaf, fi, type); @@ -64,8 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - setup_items_for_insert(root, &path, &key, &value_len, value_len, - value_len + sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, &path, &key, &value_len, 1); } /* @@ -951,7 +949,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } BTRFS_I(inode)->root = root; - btrfs_test_inode_set_ops(inode); /* [BTRFS_MAX_EXTENT_SIZE] */ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 20c6ac1a5de7..52ada47aff50 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -292,6 +292,8 @@ loop: } cur_trans->fs_info = fs_info; + atomic_set(&cur_trans->pending_ordered, 0); + init_waitqueue_head(&cur_trans->pending_wait); atomic_set(&cur_trans->num_writers, 1); extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); @@ -1182,7 +1184,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, - 0, &eb); + 0, &eb, BTRFS_NESTING_COW); btrfs_tree_unlock(eb); free_extent_buffer(eb); @@ -1587,7 +1589,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_otransid(new_root_item, trans->transid); old = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); + ret = btrfs_cow_block(trans, root, old, NULL, 0, &old, + BTRFS_NESTING_COW); if (ret) { btrfs_tree_unlock(old); free_extent_buffer(old); @@ -1636,6 +1639,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); + pending->snap = NULL; btrfs_abort_transaction(trans, ret); goto fail; } @@ -2164,6 +2168,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_wait_delalloc_flush(trans); + /* + * Wait for all ordered extents started by a fast fsync that joined this + * transaction. Otherwise if this transaction commits before the ordered + * extents complete we lose logged data after a power failure. 
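+	 * (A fast fsync waits only for writeback, so it can log file extent
+	 * items whose ordered extents have not completed yet; committing
+	 * without this wait could leave the log referencing data that never
+	 * made it to disk.)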
+	 */
+	wait_event(cur_trans->pending_wait,
+		   atomic_read(&cur_trans->pending_ordered) == 0);
+
 	btrfs_scrub_pause(fs_info);
 	/*
 	 * Ok now we need to make sure to block out any other joins while we
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d60b055b8695..858d9153a1cd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -85,6 +85,13 @@ struct btrfs_transaction {
 	spinlock_t dropped_roots_lock;
 	struct btrfs_delayed_ref_root delayed_refs;
 	struct btrfs_fs_info *fs_info;
+
+	/*
+	 * Number of ordered extents the transaction must wait for before
+	 * committing. These are ordered extents started by a fast fsync.
+	 */
+	atomic_t pending_ordered;
+	wait_queue_head_t pending_wait;
 };
 
 #define __TRANS_FREEZABLE	(1U << 0)
@@ -105,6 +112,7 @@ struct btrfs_transaction {
 #define TRANS_EXTWRITERS	(__TRANS_START | __TRANS_ATTACH)
 
 #define BTRFS_SEND_TRANS_STUB	((void *)1)
+#define BTRFS_DIO_SYNC_STUB	((void *)2)
 
 struct btrfs_trans_handle {
 	u64 transid;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 7b1fee630f97..f0ffd5ee77bd 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1035,7 +1035,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
 			   int slot)
 {
 	struct btrfs_fs_info *fs_info = leaf->fs_info;
-	struct btrfs_root_item ri;
+	struct btrfs_root_item ri = { 0 };
 	const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY |
 				     BTRFS_ROOT_SUBVOL_DEAD;
 	int ret;
@@ -1044,14 +1044,21 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
 	if (ret < 0)
 		return ret;
 
-	if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) {
+	if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) &&
+	    btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) {
 		generic_err(leaf, slot,
-			    "invalid root item size, have %u expect %zu",
-			    btrfs_item_size_nr(leaf, slot), sizeof(ri));
+			    "invalid root item size, have %u expect %zu or %u",
+			    btrfs_item_size_nr(leaf, slot), sizeof(ri),
+			    btrfs_legacy_root_item_size());
 	}
 
+	/*
+	 * For legacy root item, the members starting at generation_v2 will be
+	 * all filled with 0.
+	 * And since we allow generation_v2 as 0, it will still pass the check.
+	 */
 	read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
-			   sizeof(ri));
+			   btrfs_item_size_nr(leaf, slot));
 
 	/* Generation related */
 	if (btrfs_root_generation(&ri) >
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 39da9db35278..56cbc1706b6f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -96,8 +96,6 @@ enum {
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, struct btrfs_inode *inode,
 			   int inode_only,
-			   const loff_t start,
-			   const loff_t end,
 			   struct btrfs_log_ctx *ctx);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
@@ -176,7 +174,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 	atomic_inc(&root->log_batch);
 	atomic_inc(&root->log_writers);
 
-	if (ctx) {
+	if (ctx && !ctx->logging_new_name) {
 		int index = root->log_transid % 2;
 		list_add_tail(&ctx->list, &root->log_ctxs[index]);
 		ctx->log_transid = root->log_transid;
@@ -215,9 +213,7 @@ static int join_running_log_trans(struct btrfs_root *root)
  */
 void btrfs_pin_log_trans(struct btrfs_root *root)
 {
-	mutex_lock(&root->log_mutex);
 	atomic_inc(&root->log_writers);
-	mutex_unlock(&root->log_mutex);
 }
 
 /*
@@ -3615,6 +3611,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 	 * search and this search we'll not find the key again and can just
 	 * bail.
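For the check_root_item() hunk earlier in this chunk: accepting btrfs_legacy_root_item_size() works only because ri is zero-initialized and the read is bounded by the on-disk item size, so the members from generation_v2 onwards stay 0 for legacy items. The same pattern as a standalone sketch (struct layout and names invented for illustration):

#include <string.h>

struct root_item_v2 {			/* hypothetical stand-in */
	unsigned long long generation;
	unsigned long long flags;
	unsigned long long generation_v2;	/* absent from the legacy layout */
};

#define LEGACY_ITEM_SIZE \
	(sizeof(struct root_item_v2) - sizeof(unsigned long long))

static int read_root_item(const void *disk, size_t item_size,
			  struct root_item_v2 *ri)
{
	if (item_size != sizeof(*ri) && item_size != LEGACY_ITEM_SIZE)
		return -1;		/* unknown layout */
	memset(ri, 0, sizeof(*ri));	/* legacy tail reads back as 0 */
	memcpy(ri, disk, item_size);
	return 0;
}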
*/ +search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); if (ret != 0) goto done; @@ -3634,6 +3631,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, if (min_key.objectid != ino || min_key.type != key_type) goto done; + + if (need_resched()) { + btrfs_release_path(path); + cond_resched(); + goto search; + } + ret = overwrite_item(trans, log, dst_path, src, i, &min_key); if (ret) { @@ -4082,10 +4086,14 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_root *log_root, - const struct extent_map *em) + const struct extent_map *em, + struct btrfs_log_ctx *ctx) { + struct btrfs_ordered_extent *ordered; u64 csum_offset; u64 csum_len; + u64 mod_start = em->mod_start; + u64 mod_len = em->mod_len; LIST_HEAD(ordered_sums); int ret = 0; @@ -4094,13 +4102,71 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, em->block_start == EXTENT_MAP_HOLE) return 0; + list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { + const u64 ordered_end = ordered->file_offset + ordered->num_bytes; + const u64 mod_end = mod_start + mod_len; + struct btrfs_ordered_sum *sums; + + if (mod_len == 0) + break; + + if (ordered_end <= mod_start) + continue; + if (mod_end <= ordered->file_offset) + break; + + /* + * We are going to copy all the csums on this ordered extent, so + * go ahead and adjust mod_start and mod_len in case this ordered + * extent has already been logged. + */ + if (ordered->file_offset > mod_start) { + if (ordered_end >= mod_end) + mod_len = ordered->file_offset - mod_start; + /* + * If we have this case + * + * |--------- logged extent ---------| + * |----- ordered extent ----| + * + * Just don't mess with mod_start and mod_len, we'll + * just end up logging more csums than we need and it + * will be ok. + */ + } else { + if (ordered_end < mod_end) { + mod_len = mod_end - ordered_end; + mod_start = ordered_end; + } else { + mod_len = 0; + } + } + + /* + * To keep us from looping for the above case of an ordered + * extent that falls inside of the logged extent. + */ + if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) + continue; + + list_for_each_entry(sums, &ordered->list, list) { + ret = log_csums(trans, inode, log_root, sums); + if (ret) + return ret; + } + } + + /* We're done, found all csums in the ordered extents. */ + if (mod_len == 0) + return 0; + /* If we're compressed we have to save the entire range of csums. */ if (em->compress_type) { csum_offset = 0; csum_len = max(em->block_len, em->orig_block_len); } else { - csum_offset = em->mod_start - em->start; - csum_len = em->mod_len; + csum_offset = mod_start - em->start; + csum_len = mod_len; } /* block start is already adjusted for the file extent offset. 
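The mod_start/mod_len adjustments in log_extent_csums() above are plain interval trimming: csums already covered by a processed ordered extent are cut off the front or the back of the logged range, and an ordered extent strictly inside it is deliberately left alone, at the cost of logging a few redundant csums. A self-contained rendering of that logic with invented names:

#include <assert.h>

struct range { unsigned long long start, len; };

static void trim_by_ordered(struct range *r, unsigned long long o_start,
			    unsigned long long o_end)
{
	unsigned long long r_end = r->start + r->len;

	if (o_end <= r->start || o_start >= r_end)
		return;				/* no overlap */
	if (o_start > r->start) {
		if (o_end >= r_end)
			r->len = o_start - r->start;	/* trim the tail */
		/* else: ordered extent strictly inside, leave as-is */
	} else if (o_end < r_end) {
		r->start = o_end;			/* trim the head */
		r->len = r_end - o_end;
	} else {
		r->len = 0;				/* fully covered */
	}
}

int main(void)
{
	struct range r = { 100, 50 };

	trim_by_ordered(&r, 130, 200);	/* ordered extent overlaps the tail */
	assert(r.start == 100 && r.len == 30);
	return 0;
}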
*/ @@ -4140,7 +4206,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, int ret; int extent_inserted = 0; - ret = log_extent_csums(trans, inode, log, em); + ret = log_extent_csums(trans, inode, log, em, ctx); if (ret) return ret; @@ -4342,10 +4408,10 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_log_ctx *ctx, - const u64 start, - const u64 end) + struct btrfs_log_ctx *ctx) { + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; @@ -4359,23 +4425,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, test_gen = root->fs_info->last_trans_committed; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { - /* - * Skip extents outside our logging range. It's important to do - * it for correctness because if we don't ignore them, we may - * log them before their ordered extent completes, and therefore - * we could log them without logging their respective checksums - * (the checksum items are added to the csum tree at the very - * end of btrfs_finish_ordered_io()). Also leave such extents - * outside of our range in the list, since we may have another - * ranged fsync in the near future that needs them. If an extent - * outside our range corresponds to a hole, log it to avoid - * leaving gaps between extents (fsck will complain when we are - * not using the NO_HOLES feature). - */ - if ((em->start > end || em->start + em->len <= start) && - em->block_start != EXTENT_MAP_HOLE) - continue; - list_del_init(&em->list); /* * Just an arbitrary number, this can be really CPU intensive @@ -4434,8 +4483,32 @@ process: btrfs_release_path(path); if (!ret) ret = btrfs_log_prealloc_extents(trans, inode, path); + if (ret) + return ret; - return ret; + /* + * We have logged all extents successfully, now make sure the commit of + * the current transaction waits for the ordered extents to complete + * before it commits and wipes out the log trees, otherwise we would + * lose data if an ordered extents completes after the transaction + * commits and a power failure happens after the transaction commit. + */ + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); + + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + spin_lock_irq(&inode->ordered_tree.lock); + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); + atomic_inc(&trans->transaction->pending_ordered); + } + spin_unlock_irq(&inode->ordered_tree.lock); + } + btrfs_put_ordered_extent(ordered); + } + + return 0; } static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, @@ -4841,7 +4914,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_OTHER_INODE_ALL, - 0, LLONG_MAX, ctx); + ctx); btrfs_add_delayed_iput(inode); } } @@ -4883,7 +4956,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * Check the inode's logged_trans only instead of * btrfs_inode_in_log(). This is because the last_log_commit of * the inode is not updated when we only log that it exists and - * and it has the full sync bit set (see btrfs_log_inode()). + * it has the full sync bit set (see btrfs_log_inode()). 
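The COMPLETE/PENDING handshake in btrfs_log_changed_extents() above is a check-lock-recheck: the first, unlocked test_bit() is only an optimization, and the decision to increment pending_ordered is repeated under ordered_tree.lock so a completion racing in between cannot be missed. Stripped to its shape (stand-in types, a pthread mutex in place of the spinlock):

#include <pthread.h>
#include <stdbool.h>

struct ordered {
	pthread_mutex_t lock;		/* stands in for ordered_tree.lock */
	bool complete;
	bool pending;
};

/* Returns true if the transaction commit must wait for @o. */
static bool mark_pending_if_incomplete(struct ordered *o)
{
	bool must_wait = false;

	if (!o->complete) {		/* cheap unlocked pre-check */
		pthread_mutex_lock(&o->lock);
		if (!o->complete) {	/* authoritative re-check */
			o->pending = true;
			must_wait = true;
		}
		pthread_mutex_unlock(&o->lock);
	}
	return must_wait;
}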
*/ if (BTRFS_I(inode)->logged_trans == trans->transid) { spin_unlock(&BTRFS_I(inode)->lock); @@ -4899,7 +4972,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * log with the new name before we unpin it. */ ret = btrfs_log_inode(trans, root, BTRFS_I(inode), - LOG_OTHER_INODE, 0, LLONG_MAX, ctx); + LOG_OTHER_INODE, ctx); if (ret) { btrfs_add_delayed_iput(inode); continue; @@ -5112,8 +5185,6 @@ next_key: static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx) { struct btrfs_path *path; @@ -5292,7 +5363,7 @@ log_extents: } if (fast_search) { ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - ctx, start, end); + ctx); if (ret) { err = ret; goto out_unlock; @@ -5301,31 +5372,8 @@ log_extents: struct extent_map *em, *n; write_lock(&em_tree->lock); - /* - * We can't just remove every em if we're called for a ranged - * fsync - that is, one that doesn't cover the whole possible - * file range (0 to LLONG_MAX). This is because we can have - * em's that fall outside the range we're logging and therefore - * their ordered operations haven't completed yet - * (btrfs_finish_ordered_io() not invoked yet). This means we - * didn't get their respective file extent item in the fs/subvol - * tree yet, and need to let the next fast fsync (one which - * consults the list of modified extent maps) find the em so - * that it logs a matching file extent item and waits for the - * respective ordered operation to complete (if it's still - * running). - * - * Removing every em outside the range we're logging would make - * the next fast fsync not log their matching file extent items, - * therefore making us lose data after a log replay. - */ - list_for_each_entry_safe(em, n, &em_tree->modified_extents, - list) { - const u64 mod_end = em->mod_start + em->mod_len - 1; - - if (em->mod_start >= start && mod_end <= end) - list_del_init(&em->list); - } + list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) + list_del_init(&em->list); write_unlock(&em_tree->lock); } @@ -5339,19 +5387,34 @@ log_extents: } /* - * Don't update last_log_commit if we logged that an inode exists after - * it was loaded to memory (full_sync bit set). - * This is to prevent data loss when we do a write to the inode, then - * the inode gets evicted after all delalloc was flushed, then we log - * it exists (due to a rename for example) and then fsync it. This last - * fsync would do nothing (not logging the extents previously written). + * If we are logging that an ancestor inode exists as part of logging a + * new name from a link or rename operation, don't mark the inode as + * logged - otherwise if an explicit fsync is made against an ancestor, + * the fsync considers the inode in the log and doesn't sync the log, + * resulting in the ancestor missing after a power failure unless the + * log was synced as part of an fsync against any other unrelated inode. + * So keep it simple for this case and just don't flag the ancestors as + * logged. 
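The rule spelled out above reduces to one predicate, which the condition that follows mirrors: skip updating logged_trans/last_log_commit only when the inode is a directory visited as an ancestor while a new name is being logged, and it is not the fsync target itself. As a small truth function (hedged sketch, not kernel code):

#include <stdbool.h>

static bool skip_logged_update(bool have_ctx, bool is_dir,
			       bool logging_new_name, bool is_ctx_inode)
{
	/* mirrors: ctx && S_ISDIR(...) && ctx->logging_new_name &&
	 *          &inode->vfs_inode != ctx->inode */
	return have_ctx && is_dir && logging_new_name && !is_ctx_inode;
}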
*/ - spin_lock(&inode->lock); - inode->logged_trans = trans->transid; - if (inode_only != LOG_INODE_EXISTS || - !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) - inode->last_log_commit = inode->last_sub_trans; - spin_unlock(&inode->lock); + if (!ctx || + !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && + &inode->vfs_inode != ctx->inode)) { + spin_lock(&inode->lock); + inode->logged_trans = trans->transid; + /* + * Don't update last_log_commit if we logged that an inode exists + * after it was loaded to memory (full_sync bit set). + * This is to prevent data loss when we do a write to the inode, + * then the inode gets evicted after all delalloc was flushed, + * then we log it exists (due to a rename for example) and then + * fsync it. This last fsync would do nothing (not logging the + * extents previously written). + */ + if (inode_only != LOG_INODE_EXISTS || + !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) + inode->last_log_commit = inode->last_sub_trans; + spin_unlock(&inode->lock); + } out_unlock: mutex_unlock(&inode->log_mutex); @@ -5591,7 +5654,7 @@ process_leaf: if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), - log_mode, 0, LLONG_MAX, ctx); + log_mode, ctx); if (!ret && btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) ret = 1; @@ -5735,7 +5798,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (ctx) ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), - LOG_INODE_ALL, 0, LLONG_MAX, ctx); + LOG_INODE_ALL, ctx); if (!ret && btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) ret = 1; @@ -5786,8 +5849,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation > last_committed) ret = btrfs_log_inode(trans, root, BTRFS_I(inode), - LOG_INODE_EXISTS, - 0, LLONG_MAX, ctx); + LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); if (ret) return ret; @@ -5842,7 +5904,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, if (inode->generation > fs_info->last_trans_committed) { ret = btrfs_log_inode(trans, root, inode, - LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); + LOG_INODE_EXISTS, ctx); if (ret) break; } @@ -5950,8 +6012,6 @@ out: static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct dentry *parent, - const loff_t start, - const loff_t end, int inode_only, struct btrfs_log_ctx *ctx) { @@ -6004,7 +6064,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); + ret = btrfs_log_inode(trans, root, inode, inode_only, ctx); if (ret) goto end_trans; @@ -6100,15 +6160,13 @@ end_no_trans: */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx) { struct dentry *parent = dget_parent(dentry); int ret; ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, - start, end, LOG_INODE_ALL, ctx); + LOG_INODE_ALL, ctx); dput(parent); return ret; @@ -6371,26 +6429,13 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, /* * Call this after adding a new name for a file and it will properly * update the log to reflect the new name. - * - * @ctx can not be NULL when @sync_log is false, and should be NULL when it's - * true (because it's not used). 
- * - * Return value depends on whether @sync_log is true or false. - * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be - * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT - * otherwise. - * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to - * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, - * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be - * committed (without attempting to sync the log). */ -int btrfs_log_new_name(struct btrfs_trans_handle *trans, +void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent, - bool sync_log, struct btrfs_log_ctx *ctx) + struct dentry *parent) { struct btrfs_fs_info *fs_info = trans->fs_info; - int ret; + struct btrfs_log_ctx ctx; /* * this will force the logging code to walk the dentry chain @@ -6405,34 +6450,17 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, */ if (inode->logged_trans <= fs_info->last_trans_committed && (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) - return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT : - BTRFS_DONT_NEED_LOG_SYNC; - - if (sync_log) { - struct btrfs_log_ctx ctx2; - - btrfs_init_log_ctx(&ctx2, &inode->vfs_inode); - ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, &ctx2); - if (ret == BTRFS_NO_LOG_SYNC) - return BTRFS_DONT_NEED_TRANS_COMMIT; - else if (ret) - return BTRFS_NEED_TRANS_COMMIT; - - ret = btrfs_sync_log(trans, inode->root, &ctx2); - if (ret) - return BTRFS_NEED_TRANS_COMMIT; - return BTRFS_DONT_NEED_TRANS_COMMIT; - } - - ASSERT(ctx); - ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, ctx); - if (ret == BTRFS_NO_LOG_SYNC) - return BTRFS_DONT_NEED_LOG_SYNC; - else if (ret) - return BTRFS_NEED_TRANS_COMMIT; + return; - return BTRFS_NEED_LOG_SYNC; + btrfs_init_log_ctx(&ctx, &inode->vfs_inode); + ctx.logging_new_name = true; + /* + * We don't care about the return value. If we fail to log the new name + * then we know the next attempt to sync the log will fallback to a full + * transaction commit (due to a call to btrfs_set_log_full_commit()), so + * we don't need to worry about getting a log committed that has an + * inconsistent state after a rename operation. + */ + btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 132e43d29034..731bd9c029f5 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -16,8 +16,11 @@ struct btrfs_log_ctx { int log_ret; int log_transid; bool log_new_dentries; + bool logging_new_name; struct inode *inode; struct list_head list; + /* Only used for fast fsyncs. 
*/ + struct list_head ordered_extents; }; static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, @@ -26,8 +29,23 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, ctx->log_ret = 0; ctx->log_transid = 0; ctx->log_new_dentries = false; + ctx->logging_new_name = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); + INIT_LIST_HEAD(&ctx->ordered_extents); +} + +static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; + + ASSERT(inode_is_locked(ctx->inode)); + + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + btrfs_put_ordered_extent(ordered); + } } static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans) @@ -49,8 +67,6 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -67,16 +83,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, int for_rename); void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir); -/* Return values for btrfs_log_new_name() */ -enum { - BTRFS_DONT_NEED_TRANS_COMMIT, - BTRFS_NEED_TRANS_COMMIT, - BTRFS_DONT_NEED_LOG_SYNC, - BTRFS_NEED_LOG_SYNC, -}; -int btrfs_log_new_name(struct btrfs_trans_handle *trans, +void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent, - bool sync_log, struct btrfs_log_ctx *ctx); + struct dentry *parent); #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 214856c4ccb1..58b9c419a2b6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4,6 +4,7 @@ */ #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/bio.h> #include <linux/slab.h> #include <linux/blkdev.h> @@ -290,8 +291,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * balance_mutex * * - * Exclusive operations, BTRFS_FS_EXCL_OP - * ====================================== + * Exclusive operations + * ==================== * * Maintains the exclusivity of the following operations that apply to the * whole filesystem and cannot run in parallel. @@ -317,11 +318,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * - system power-cycle and filesystem mounted as read-only * - filesystem or device errors leading to forced read-only * - * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations. - * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set. + * The status of exclusive operation is set and cleared atomically. + * During the course of Paused state, fs_info::exclusive_operation remains set. * A device operation in Paused or Running state can be canceled or resumed * either by ioctl (Balance only) or when remounted as read-write. - * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or + * The exclusive status is cleared when the device operation is canceled or * completed. 
*/ @@ -355,6 +356,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, INIT_LIST_HEAD(&fs_devs->devices); INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->fs_list); + INIT_LIST_HEAD(&fs_devs->seed_list); if (fsid) memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); @@ -405,7 +407,7 @@ void __exit btrfs_cleanup_fs_uuids(void) * Returned struct is not linked onto any lists and must be destroyed using * btrfs_free_device. */ -static struct btrfs_device *__alloc_device(void) +static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) { struct btrfs_device *dev; @@ -432,7 +434,8 @@ static struct btrfs_device *__alloc_device(void) btrfs_device_data_ordered_init(dev); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); - extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL); + extent_io_tree_init(fs_info, &dev->alloc_state, + IO_TREE_DEVICE_ALLOC_STATE, NULL); return dev; } @@ -592,8 +595,6 @@ static int btrfs_free_stale_devices(const char *path, btrfs_free_device(device); ret = 0; - if (fs_devices->num_devices == 0) - break; } mutex_unlock(&fs_devices->device_list_mutex); @@ -940,16 +941,18 @@ static noinline struct btrfs_device *device_list_add(const char *path, bdput(path_bdev); mutex_unlock(&fs_devices->device_list_mutex); btrfs_warn_in_rcu(device->fs_info, - "duplicate device fsid:devid for %pU:%llu old:%s new:%s", - disk_super->fsid, devid, - rcu_str_deref(device->name), path); + "duplicate device %s devid %llu generation %llu scanned by %s (%d)", + path, devid, found_transid, + current->comm, + task_pid_nr(current)); return ERR_PTR(-EEXIST); } bdput(path_bdev); btrfs_info_in_rcu(device->fs_info, - "device fsid %pU devid %llu moved old:%s new:%s", - disk_super->fsid, devid, - rcu_str_deref(device->name), path); + "devid %llu device path %s changed to %s scanned by %s (%d)", + devid, rcu_str_deref(device->name), + path, current->comm, + task_pid_nr(current)); } name = rcu_string_strdup(path, GFP_NOFS); @@ -1034,28 +1037,21 @@ error: return ERR_PTR(ret); } -/* - * After we have read the system tree and know devids belonging to - * this filesystem, remove the device which does not belong there. - */ -void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) +static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, + int step, struct btrfs_device **latest_dev) { struct btrfs_device *device, *next; - struct btrfs_device *latest_dev = NULL; - mutex_lock(&uuid_mutex); -again: /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, - &device->dev_state)) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, - &device->dev_state) && + &device->dev_state) && !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) && - (!latest_dev || - device->generation > latest_dev->generation)) { - latest_dev = device; + (!*latest_dev || + device->generation > (*latest_dev)->generation)) { + *latest_dev = device; } continue; } @@ -1093,10 +1089,22 @@ again: btrfs_free_device(device); } - if (fs_devices->seed) { - fs_devices = fs_devices->seed; - goto again; - } +} + +/* + * After we have read the system tree and know devids belonging to this + * filesystem, remove the device which does not belong there. 
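The __btrfs_free_extra_devids() split above exists because the old code walked a hand-rolled singly linked chain through fs_devices->seed. After the conversion, every seed fs_devices instead hangs off the seed_list list_head, so the walk becomes the standard iterator. The shape of the change (kernel-style fragment assuming <linux/list.h>; process() is a placeholder):

/* before: pointer chasing through ->seed */
while (fs_devices) {
	process(fs_devices);
	fs_devices = fs_devices->seed;
}

/* after: handle the primary fs_devices once, then iterate the anchored list */
process(fs_devices);
list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
	process(seed_devs);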
+ */ +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) +{ + struct btrfs_device *latest_dev = NULL; + struct btrfs_fs_devices *seed_dev; + + mutex_lock(&uuid_mutex); + __btrfs_free_extra_devids(fs_devices, step, &latest_dev); + + list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) + __btrfs_free_extra_devids(seed_dev, step, &latest_dev); fs_devices->latest_bdev = latest_dev->bdev; @@ -1148,47 +1156,41 @@ static void btrfs_close_one_device(struct btrfs_device *device) ASSERT(atomic_read(&device->reada_in_flight) == 0); } -static int close_fs_devices(struct btrfs_fs_devices *fs_devices) +static void close_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; + lockdep_assert_held(&uuid_mutex); + if (--fs_devices->opened > 0) - return 0; + return; - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) btrfs_close_one_device(device); - } - mutex_unlock(&fs_devices->device_list_mutex); WARN_ON(fs_devices->open_devices); WARN_ON(fs_devices->rw_devices); fs_devices->opened = 0; fs_devices->seeding = false; - - return 0; + fs_devices->fs_info = NULL; } -int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { - struct btrfs_fs_devices *seed_devices = NULL; - int ret; + LIST_HEAD(list); + struct btrfs_fs_devices *tmp; mutex_lock(&uuid_mutex); - ret = close_fs_devices(fs_devices); - if (!fs_devices->opened) { - seed_devices = fs_devices->seed; - fs_devices->seed = NULL; - } - mutex_unlock(&uuid_mutex); + close_fs_devices(fs_devices); + if (!fs_devices->opened) + list_splice_init(&fs_devices->seed_list, &list); - while (seed_devices) { - fs_devices = seed_devices; - seed_devices = fs_devices->seed; + list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { close_fs_devices(fs_devices); + list_del(&fs_devices->seed_list); free_fs_devices(fs_devices); } - return ret; + mutex_unlock(&uuid_mutex); } static int open_fs_devices(struct btrfs_fs_devices *fs_devices, @@ -1196,17 +1198,23 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, { struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; + struct btrfs_device *tmp_device; flags |= FMODE_EXCL; - list_for_each_entry(device, &fs_devices->devices, dev_list) { - /* Just open everything we can; ignore failures here */ - if (btrfs_open_one_device(fs_devices, device, flags, holder)) - continue; + list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, + dev_list) { + int ret; - if (!latest_dev || - device->generation > latest_dev->generation) + ret = btrfs_open_one_device(fs_devices, device, flags, holder); + if (ret == 0 && + (!latest_dev || device->generation > latest_dev->generation)) { latest_dev = device; + } else if (ret == -ENODATA) { + fs_devices->num_devices--; + list_del(&device->dev_list); + btrfs_free_device(device); + } } if (fs_devices->open_devices == 0) return -EINVAL; @@ -1960,16 +1968,13 @@ static struct btrfs_device * btrfs_find_next_active_device( * this_dev) which is active. 
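open_fs_devices() above switches to list_for_each_entry_safe() because it may now unlink the element being visited (the -ENODATA case frees the device). The _safe variant caches the next pointer up front; the same concern in plain C, using a pointer-to-link so the current node can be freed mid-walk:

#include <stdlib.h>

struct node { int bad; struct node *next; };

static void prune(struct node **head)
{
	struct node **link = head;

	while (*link) {
		struct node *n = *link;

		if (n->bad) {
			*link = n->next;	/* unlink before freeing */
			free(n);
		} else {
			link = &n->next;
		}
	}
}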
*/ void __cold btrfs_assign_next_active_device(struct btrfs_device *device, - struct btrfs_device *this_dev) + struct btrfs_device *next_device) { struct btrfs_fs_info *fs_info = device->fs_info; - struct btrfs_device *next_device; - if (this_dev) - next_device = this_dev; - else + if (!next_device) next_device = btrfs_find_next_active_device(fs_info->fs_devices, - device); + device); ASSERT(next_device); if (fs_info->sb->s_bdev && @@ -1998,9 +2003,9 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) return num_devices; } -static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, - struct block_device *bdev, - const char *device_path) +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, + struct block_device *bdev, + const char *device_path) { struct btrfs_super_block *disk_super; int copy_num; @@ -2039,7 +2044,7 @@ static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, } int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, - u64 devid) + u64 devid) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; @@ -2143,7 +2148,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (device->bdev) { cur_devices->open_devices--; /* remove sysfs entry */ - btrfs_sysfs_remove_devices_dir(fs_devices, device); + btrfs_sysfs_remove_device(device); } num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; @@ -2164,14 +2169,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, btrfs_free_device(device); if (cur_devices->open_devices == 0) { - while (fs_devices) { - if (fs_devices->seed == cur_devices) { - fs_devices->seed = cur_devices->seed; - break; - } - fs_devices = fs_devices->seed; - } - cur_devices->seed = NULL; + list_del_init(&cur_devices->seed_list); close_fs_devices(cur_devices); free_fs_devices(cur_devices); } @@ -2220,14 +2218,9 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) { - struct btrfs_fs_info *fs_info = srcdev->fs_info; struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; - if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { - /* zero out the old super if it is writable */ - btrfs_scratch_superblocks(fs_info, srcdev->bdev, - srcdev->name->str); - } + mutex_lock(&uuid_mutex); btrfs_close_bdev(srcdev); synchronize_rcu(); @@ -2235,8 +2228,6 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { - struct btrfs_fs_devices *tmp_fs_devices; - /* * On a mounted FS, num_devices can't be zero unless it's a * seed. 
In case of a seed device being replaced, the replace @@ -2245,18 +2236,11 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) */ ASSERT(fs_devices->seeding); - tmp_fs_devices = fs_info->fs_devices; - while (tmp_fs_devices) { - if (tmp_fs_devices->seed == fs_devices) { - tmp_fs_devices->seed = fs_devices->seed; - break; - } - tmp_fs_devices = tmp_fs_devices->seed; - } - fs_devices->seed = NULL; + list_del_init(&fs_devices->seed_list); close_fs_devices(fs_devices); free_fs_devices(fs_devices); } + mutex_unlock(&uuid_mutex); } void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) @@ -2265,7 +2249,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) mutex_lock(&fs_devices->device_list_mutex); - btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev); + btrfs_sysfs_remove_device(tgtdev); if (tgtdev->bdev) fs_devices->open_devices--; @@ -2374,10 +2358,20 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) if (!fs_devices->seeding) return -EINVAL; + /* + * Private copy of the seed devices, anchored at + * fs_info->fs_devices->seed_list + */ seed_devices = alloc_fs_devices(NULL, NULL); if (IS_ERR(seed_devices)) return PTR_ERR(seed_devices); + /* + * It's necessary to retain a copy of the original seed fs_devices in + * fs_uuids so that filesystems which have been seeded can successfully + * reference the seed device from open_seed_devices. This also supports + * multiple fs seed. + */ old_devices = clone_fs_devices(fs_devices); if (IS_ERR(old_devices)) { kfree(seed_devices); @@ -2398,16 +2392,12 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) list_for_each_entry(device, &seed_devices->devices, dev_list) device->fs_devices = seed_devices; - mutex_lock(&fs_info->chunk_mutex); - list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); - mutex_unlock(&fs_info->chunk_mutex); - fs_devices->seeding = false; fs_devices->num_devices = 0; fs_devices->open_devices = 0; fs_devices->missing_devices = 0; fs_devices->rotating = false; - fs_devices->seed = seed_devices; + list_add(&seed_devices->seed_list, &fs_devices->seed_list); generate_random_uuid(fs_devices->fsid); memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); @@ -2510,7 +2500,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path u64 orig_super_num_devices; int seeding_dev = 0; int ret = 0; - bool unlocked = false; + bool locked = false; if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; @@ -2524,20 +2514,20 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path seeding_dev = 1; down_write(&sb->s_umount); mutex_lock(&uuid_mutex); + locked = true; } - filemap_write_and_wait(bdev->bd_inode->i_mapping); + sync_blockdev(bdev); - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; - mutex_unlock( - &fs_devices->device_list_mutex); + rcu_read_unlock(); goto error; } } - mutex_unlock(&fs_devices->device_list_mutex); + rcu_read_unlock(); device = btrfs_alloc_device(fs_info, NULL, NULL); if (IS_ERR(device)) { @@ -2612,9 +2602,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_set_super_num_devices(fs_info->super_copy, orig_super_num_devices + 1); - /* add sysfs device entry */ - btrfs_sysfs_add_devices_dir(fs_devices, device); - /* * we've got more storage, clear 
any full flags on the space * infos @@ -2622,6 +2609,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_clear_space_info_full(fs_info); mutex_unlock(&fs_info->chunk_mutex); + + /* Add sysfs device entry */ + btrfs_sysfs_add_device(device); + mutex_unlock(&fs_devices->device_list_mutex); if (seeding_dev) { @@ -2647,8 +2638,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error_sysfs; } - btrfs_sysfs_update_sprout_fsid(fs_devices, - fs_info->fs_devices->fsid); + /* + * fs_devices now represents the newly sprouted filesystem and + * its fsid has been changed by btrfs_prepare_sprout + */ + btrfs_sysfs_update_sprout_fsid(fs_devices); } ret = btrfs_commit_transaction(trans); @@ -2656,7 +2650,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); - unlocked = true; + locked = false; if (ret) /* transaction commit */ return ret; @@ -2691,7 +2685,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return ret; error_sysfs: - btrfs_sysfs_remove_devices_dir(fs_devices, device); + btrfs_sysfs_remove_device(device); mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); list_del_rcu(&device->dev_list); @@ -2717,7 +2711,7 @@ error_free_device: btrfs_free_device(device); error: blkdev_put(bdev, FMODE_EXCL); - if (seeding_dev && !unlocked) { + if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } @@ -4044,7 +4038,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, /* * rw_devices will not change at the moment, device add/delete/replace - * are excluded by EXCL_OP + * are exclusive */ num_devices = fs_info->fs_devices->rw_devices; @@ -4180,7 +4174,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if ((ret && ret != -ECANCELED && ret != -ENOSPC) || balance_need_close(fs_info)) { reset_balance_state(fs_info); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); } wake_up(&fs_info->balance_wait_q); @@ -4191,7 +4185,7 @@ out: reset_balance_state(fs_info); else kfree(bctl); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return ret; } @@ -4293,7 +4287,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) * is in a paused state and must have fs_info::balance_ctl properly * set up. 
*/ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) btrfs_warn(fs_info, "balance: cannot set exclusive op status, resume manually"); @@ -4375,7 +4369,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) if (fs_info->balance_ctl) { reset_balance_state(fs_info); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); btrfs_info(fs_info, "balance: canceled"); } } @@ -6460,11 +6454,21 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, bool seed) { struct btrfs_device *device; + struct btrfs_fs_devices *seed_devs; + + if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (device->devid == devid && + (!uuid || memcmp(device->uuid, uuid, + BTRFS_UUID_SIZE) == 0)) + return device; + } + } - while (fs_devices) { + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { if (!fsid || - !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { - list_for_each_entry(device, &fs_devices->devices, + !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + list_for_each_entry(device, &seed_devs->devices, dev_list) { if (device->devid == devid && (!uuid || memcmp(device->uuid, uuid, @@ -6472,11 +6476,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, return device; } } - if (seed) - fs_devices = fs_devices->seed; - else - return NULL; } + return NULL; } @@ -6484,8 +6485,17 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, u64 devid, u8 *dev_uuid) { struct btrfs_device *device; + unsigned int nofs_flag; + /* + * We call this under the chunk_mutex, so we want to use NOFS for this + * allocation, however we don't want to change btrfs_alloc_device() to + * always do NOFS because we use it in a lot of other GFP_KERNEL safe + * places. 
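The NOFS comment above refers to the scoped-allocation interface from <linux/sched/mm.h>: rather than threading GFP_NOFS through every callee, the caller brackets the critical region and all allocations inside it are implicitly degraded. Canonical usage, shown here as a kernel-style fragment:

	unsigned int nofs_flag;

	/* holding chunk_mutex here: reclaim must not re-enter the filesystem */
	nofs_flag = memalloc_nofs_save();
	ptr = kmalloc(size, GFP_KERNEL);	/* behaves as GFP_NOFS in this scope */
	memalloc_nofs_restore(nofs_flag);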
+ */ + nofs_flag = memalloc_nofs_save(); device = btrfs_alloc_device(NULL, &devid, dev_uuid); + memalloc_nofs_restore(nofs_flag); if (IS_ERR(device)) return device; @@ -6522,7 +6532,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, if (WARN_ON(!devid && !fs_info)) return ERR_PTR(-EINVAL); - dev = __alloc_device(); + dev = __alloc_device(fs_info); if (IS_ERR(dev)) return dev; @@ -6718,13 +6728,11 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, lockdep_assert_held(&uuid_mutex); ASSERT(fsid); - fs_devices = fs_info->fs_devices->seed; - while (fs_devices) { + /* This will match only for multi-device seed fs */ + list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) return fs_devices; - fs_devices = fs_devices->seed; - } fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { @@ -6740,6 +6748,10 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, return fs_devices; } + /* + * Upon first call for a seed fs fsid, just create a private copy of the + * respective fs_devices and anchor it at fs_info->fs_devices->seed_list + */ fs_devices = clone_fs_devices(fs_devices); if (IS_ERR(fs_devices)) return fs_devices; @@ -6747,20 +6759,17 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); if (ret) { free_fs_devices(fs_devices); - fs_devices = ERR_PTR(ret); - goto out; + return ERR_PTR(ret); } if (!fs_devices->seeding) { close_fs_devices(fs_devices); free_fs_devices(fs_devices); - fs_devices = ERR_PTR(-EINVAL); - goto out; + return ERR_PTR(-EINVAL); } - fs_devices->seed = fs_info->fs_devices->seed; - fs_info->fs_devices->seed = fs_devices; -out: + list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); + return fs_devices; } @@ -7179,17 +7188,22 @@ error: void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; - while (fs_devices) { - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) + fs_devices->fs_info = fs_info; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) + device->fs_info = fs_info; + + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed_devs->devices, dev_list) device->fs_info = fs_info; - mutex_unlock(&fs_devices->device_list_mutex); - fs_devices = fs_devices->seed; + seed_devs->fs_info = fs_info; } + mutex_unlock(&fs_devices->device_list_mutex); } static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, @@ -7215,17 +7229,53 @@ static void btrfs_set_dev_stats_value(struct extent_buffer *eb, sizeof(val)); } -int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) +static int btrfs_device_init_dev_stats(struct btrfs_device *device, + struct btrfs_path *path) { - struct btrfs_key key; - struct btrfs_root *dev_root = fs_info->dev_root; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_dev_stats_item *ptr; struct extent_buffer *eb; - int slot; - int ret = 0; + struct btrfs_key key; + int item_size; + int i, ret, slot; + + key.objectid = BTRFS_DEV_STATS_OBJECTID; + key.type = BTRFS_PERSISTENT_ITEM_KEY; + key.offset = device->devid; + ret = 
btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); + if (ret) { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_dev_stat_set(device, i, 0); + device->dev_stats_valid = 1; + btrfs_release_path(path); + return ret < 0 ? ret : 0; + } + slot = path->slots[0]; + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, slot); + + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (item_size >= (1 + i) * sizeof(__le64)) + btrfs_dev_stat_set(device, i, + btrfs_dev_stats_value(eb, ptr, i)); + else + btrfs_dev_stat_set(device, i, 0); + } + + device->dev_stats_valid = 1; + btrfs_dev_stat_print_on_load(device); + btrfs_release_path(path); + + return 0; +} + +int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; struct btrfs_path *path = NULL; - int i; + int ret = 0; path = btrfs_alloc_path(); if (!path) @@ -7233,43 +7283,22 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { - int item_size; - struct btrfs_dev_stats_item *ptr; - - key.objectid = BTRFS_DEV_STATS_OBJECTID; - key.type = BTRFS_PERSISTENT_ITEM_KEY; - key.offset = device->devid; - ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); - if (ret) { - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) - btrfs_dev_stat_set(device, i, 0); - device->dev_stats_valid = 1; - btrfs_release_path(path); - continue; - } - slot = path->slots[0]; - eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, slot); - - ptr = btrfs_item_ptr(eb, slot, - struct btrfs_dev_stats_item); - - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { - if (item_size >= (1 + i) * sizeof(__le64)) - btrfs_dev_stat_set(device, i, - btrfs_dev_stats_value(eb, ptr, i)); - else - btrfs_dev_stat_set(device, i, 0); + ret = btrfs_device_init_dev_stats(device, path); + if (ret) + goto out; + } + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed_devs->devices, dev_list) { + ret = btrfs_device_init_dev_stats(device, path); + if (ret) + goto out; } - - device->dev_stats_valid = 1; - btrfs_dev_stat_print_on_load(device); - btrfs_release_path(path); } +out: mutex_unlock(&fs_devices->device_list_mutex); btrfs_free_path(path); - return ret < 0 ? ret : 0; + return ret; } static int update_dev_stat_item(struct btrfs_trans_handle *trans, @@ -7486,24 +7515,6 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans) mutex_unlock(&trans->fs_info->chunk_mutex); } -void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) -{ - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - while (fs_devices) { - fs_devices->fs_info = fs_info; - fs_devices = fs_devices->seed; - } -} - -void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) -{ - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - while (fs_devices) { - fs_devices->fs_info = NULL; - fs_devices = fs_devices->seed; - } -} - /* * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. 
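btrfs_device_init_dev_stats(), factored out above, keeps the old size-guarded decode: a stats item written by an older kernel may carry fewer __le64 counters, so each index is checked against item_size and absent counters default to 0. A standalone sketch of that forward-compatible read (invented names, little-endian host assumed for brevity):

#include <stdint.h>
#include <string.h>

#define DEV_STAT_VALUES_MAX 5

static void decode_dev_stats(const uint8_t *item, size_t item_size,
			     uint64_t stats[DEV_STAT_VALUES_MAX])
{
	for (int i = 0; i < DEV_STAT_VALUES_MAX; i++) {
		if (item_size >= (size_t)(i + 1) * sizeof(uint64_t))
			memcpy(&stats[i], item + i * sizeof(uint64_t),
			       sizeof(stats[i]));
		else
			stats[i] = 0;	/* counter added after this item was written */
	}
}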
*/ @@ -7584,8 +7595,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, /* It's possible this device is a dummy for seed device */ if (dev->disk_total_bytes == 0) { - dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL, - NULL, false); + struct btrfs_fs_devices *devs; + + devs = list_first_entry(&fs_info->fs_devices->seed_list, + struct btrfs_fs_devices, seed_list); + dev = btrfs_find_device(devs, devid, NULL, NULL, false); if (!dev) { btrfs_err(fs_info, "failed to find seed devid %llu", devid); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5eea93916fbf..bf27ac07d315 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -58,7 +58,7 @@ struct btrfs_device { struct btrfs_fs_devices *fs_devices; struct btrfs_fs_info *fs_info; - struct rcu_string *name; + struct rcu_string __rcu *name; u64 generation; @@ -246,7 +246,7 @@ struct btrfs_fs_devices { */ struct list_head alloc_list; - struct btrfs_fs_devices *seed; + struct list_head seed_list; bool seeding; int opened; @@ -435,7 +435,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); int btrfs_forget_devices(const char *path); -int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *this_dev); @@ -569,10 +569,11 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); -void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); -void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev); +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, + struct block_device *bdev, + const char *device_path); int btrfs_bg_type_to_factor(u64 flags); const char *btrfs_bg_type_to_raid_name(u64 flags); diff --git a/fs/buffer.c b/fs/buffer.c index 50bbc99e3d96..5a28a6aa7f16 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2771,16 +2771,6 @@ int nobh_writepage(struct page *page, get_block_t *get_block, /* Is the page fully outside i_size? (truncate in progress) */ offset = i_size & (PAGE_SIZE-1); if (page->index >= end_index+1 || !offset) { - /* - * The page may have dirty, unmapped buffers. For example, - * they may have been added in ext3_writepage(). Make them - * freeable here, so the page does not leak. - */ -#if 0 - /* Not really sure about this - do we need this ? */ - if (page->mapping->a_ops->invalidatepage) - page->mapping->a_ops->invalidatepage(page, offset); -#endif unlock_page(page); return 0; /* don't care */ } @@ -2975,12 +2965,6 @@ int block_write_full_page(struct page *page, get_block_t *get_block, /* Is the page fully outside i_size? (truncate in progress) */ offset = i_size & (PAGE_SIZE-1); if (page->index >= end_index+1 || !offset) { - /* - * The page may have dirty, unmapped buffers. For example, - * they may have been added in ext3_writepage(). Make them - * freeable here, so the page does not leak. 
- */ - do_invalidatepage(page, 0, PAGE_SIZE); unlock_page(page); return 0; /* don't care */ } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 3989d08396ac..1f75b25e559a 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1017,6 +1017,8 @@ handle_mnt_opt: if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) { rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, true, full_path, fid); + if (rc == -EREMOTE) + rc = 0; if (rc) { cifs_dbg(FYI, "%s: Get mode from SID failed. rc=%d\n", __func__, rc); @@ -1025,6 +1027,8 @@ handle_mnt_opt: } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, false, full_path, fid); + if (rc == -EREMOTE) + rc = 0; if (rc) { cifs_dbg(FYI, "%s: Getting ACL failed with error: %d\n", __func__, rc); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 32f90dc82c84..d44df8f95bcd 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1208,7 +1208,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, rqst[1].rq_iov = si_iov; rqst[1].rq_nvec = 1; - len = sizeof(ea) + ea_name_len + ea_value_len + 1; + len = sizeof(*ea) + ea_name_len + ea_value_len + 1; ea = kzalloc(len, GFP_KERNEL); if (ea == NULL) { rc = -ENOMEM; diff --git a/fs/compat.c b/fs/compat.c deleted file mode 100644 index 436d228cf71c..000000000000 --- a/fs/compat.c +++ /dev/null @@ -1,132 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/fs/compat.c - * - * Kernel compatibililty routines for e.g. 32 bit syscall support - * on 64 bit kernels. - * - * Copyright (C) 2002 Stephen Rothwell, IBM Corporation - * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) - * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs - * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz) - */ - -#include <linux/compat.h> -#include <linux/nfs4_mount.h> -#include <linux/syscalls.h> -#include <linux/slab.h> -#include <linux/uaccess.h> -#include "internal.h" - -struct compat_nfs_string { - compat_uint_t len; - compat_uptr_t data; -}; - -static inline void compat_nfs_string(struct nfs_string *dst, - struct compat_nfs_string *src) -{ - dst->data = compat_ptr(src->data); - dst->len = src->len; -} - -struct compat_nfs4_mount_data_v1 { - compat_int_t version; - compat_int_t flags; - compat_int_t rsize; - compat_int_t wsize; - compat_int_t timeo; - compat_int_t retrans; - compat_int_t acregmin; - compat_int_t acregmax; - compat_int_t acdirmin; - compat_int_t acdirmax; - struct compat_nfs_string client_addr; - struct compat_nfs_string mnt_path; - struct compat_nfs_string hostname; - compat_uint_t host_addrlen; - compat_uptr_t host_addr; - compat_int_t proto; - compat_int_t auth_flavourlen; - compat_uptr_t auth_flavours; -}; - -static int do_nfs4_super_data_conv(void *raw_data) -{ - int version = *(compat_uint_t *) raw_data; - - if (version == 1) { - struct compat_nfs4_mount_data_v1 *raw = raw_data; - struct nfs4_mount_data *real = raw_data; - - /* copy the fields backwards */ - real->auth_flavours = compat_ptr(raw->auth_flavours); - real->auth_flavourlen = raw->auth_flavourlen; - real->proto = raw->proto; - real->host_addr = compat_ptr(raw->host_addr); - real->host_addrlen = raw->host_addrlen; - compat_nfs_string(&real->hostname, &raw->hostname); - compat_nfs_string(&real->mnt_path, &raw->mnt_path); - compat_nfs_string(&real->client_addr, &raw->client_addr); - real->acdirmax = raw->acdirmax; - real->acdirmin = raw->acdirmin; - real->acregmax = raw->acregmax; - real->acregmin = raw->acregmin; - real->retrans = 
raw->retrans; - real->timeo = raw->timeo; - real->wsize = raw->wsize; - real->rsize = raw->rsize; - real->flags = raw->flags; - real->version = raw->version; - } - - return 0; -} - -#define NFS4_NAME "nfs4" - -COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, - const char __user *, dir_name, - const char __user *, type, compat_ulong_t, flags, - const void __user *, data) -{ - char *kernel_type; - void *options; - char *kernel_dev; - int retval; - - kernel_type = copy_mount_string(type); - retval = PTR_ERR(kernel_type); - if (IS_ERR(kernel_type)) - goto out; - - kernel_dev = copy_mount_string(dev_name); - retval = PTR_ERR(kernel_dev); - if (IS_ERR(kernel_dev)) - goto out1; - - options = copy_mount_options(data); - retval = PTR_ERR(options); - if (IS_ERR(options)) - goto out2; - - if (kernel_type && options) { - if (!strcmp(kernel_type, NFS4_NAME)) { - retval = -EINVAL; - if (do_nfs4_super_data_conv(options)) - goto out3; - } - } - - retval = do_mount(kernel_dev, dir_name, kernel_type, flags, options); - - out3: - kfree(options); - out2: - kfree(kernel_dev); - out1: - kfree(kernel_type); - out: - return retval; -} diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index ca2273727225..b0983e2a4e2c 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1168,7 +1168,7 @@ EXPORT_SYMBOL(configfs_depend_item); /* * Release the dependent linkage. This is much simpler than - * configfs_depend_item() because we know that that the client driver is + * configfs_depend_item() because we know that the client driver is * pinned, thus the subsystem is pinned, and therefore configfs is pinned. */ void configfs_undepend_item(struct config_item *target) diff --git a/fs/configfs/file.c b/fs/configfs/file.c index fb65b706cc0d..1f0270229d7b 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -267,7 +267,7 @@ flush_write_buffer(struct file *file, struct configfs_buffer *buffer, size_t cou * There is no easy way for us to know if userspace is only doing a partial * write, so we don't support them. We expect the entire buffer to come * on the first write. - * Hint: if you're writing a value, first read the file, modify only the + * Hint: if you're writing a value, first read the file, modify only * the value you're changing, then write entire buffer back. 
*/ diff --git a/fs/coredump.c b/fs/coredump.c index 76e7c10edfc0..0cd9056d79cc 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -840,17 +840,17 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr) ssize_t n; if (cprm->written + nr > cprm->limit) return 0; - while (nr) { - if (dump_interrupted()) - return 0; - n = __kernel_write(file, addr, nr, &pos); - if (n <= 0) - return 0; - file->f_pos = pos; - cprm->written += n; - cprm->pos += n; - nr -= n; - } + + + if (dump_interrupted()) + return 0; + n = __kernel_write(file, addr, nr, &pos); + if (n != nr) + return 0; + file->f_pos = pos; + cprm->written += n; + cprm->pos += n; + return 1; } EXPORT_SYMBOL(dump_emit); @@ -876,6 +876,40 @@ int dump_skip(struct coredump_params *cprm, size_t nr) } EXPORT_SYMBOL(dump_skip); +#ifdef CONFIG_ELF_CORE +int dump_user_range(struct coredump_params *cprm, unsigned long start, + unsigned long len) +{ + unsigned long addr; + + for (addr = start; addr < start + len; addr += PAGE_SIZE) { + struct page *page; + int stop; + + /* + * To avoid having to allocate page tables for virtual address + * ranges that have never been used yet, and also to make it + * easy to generate sparse core files, use a helper that returns + * NULL when encountering an empty page table entry that would + * otherwise have been filled with the zero page. + */ + page = get_dump_page(addr); + if (page) { + void *kaddr = kmap(page); + + stop = !dump_emit(cprm, kaddr, PAGE_SIZE); + kunmap(page); + put_page(page); + } else { + stop = !dump_skip(cprm, PAGE_SIZE); + } + if (stop) + return 0; + } + return 1; +} +#endif + int dump_align(struct coredump_params *cprm, int align) { unsigned mod = cprm->pos & (align - 1); @@ -902,3 +936,183 @@ void dump_truncate(struct coredump_params *cprm) } } EXPORT_SYMBOL(dump_truncate); + +/* + * The purpose of always_dump_vma() is to make sure that special kernel mappings + * that are useful for post-mortem analysis are included in every core dump. + * In that way we ensure that the core dump is fully interpretable later + * without matching up the same kernel and hardware config to see what PC values + * meant. These special mappings include - vDSO, vsyscall, and other + * architecture specific mappings + */ +static bool always_dump_vma(struct vm_area_struct *vma) +{ + /* Any vsyscall mappings? */ + if (vma == get_gate_vma(vma->vm_mm)) + return true; + + /* + * Assume that all vmas with a .name op should always be dumped. + * If this changes, a new vm_ops field can easily be added. + */ + if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma)) + return true; + + /* + * arch_vma_name() returns non-NULL for special architecture mappings, + * such as vDSO sections. + */ + if (arch_vma_name(vma)) + return true; + + return false; +} + +/* + * Decide how much of @vma's contents should be included in a core dump. 
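dump_user_range() above deliberately distinguishes a mapped page from one never touched: present pages go through dump_emit(), holes advance the file position via dump_skip(), so the core file can stay sparse instead of carrying explicit zero pages. The walk in miniature (standalone sketch; the three helpers are declared stand-ins, not real kernel calls):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096

bool emit(const void *buf, size_t len);		/* dump_emit() stand-in */
bool skip(size_t len);				/* dump_skip() stand-in */
const void *lookup_page(uintptr_t addr);	/* get_dump_page() stand-in,
						 * NULL means hole */

static bool dump_range(uintptr_t start, size_t len)
{
	for (uintptr_t addr = start; addr < start + len; addr += PAGE_SIZE) {
		const void *page = lookup_page(addr);

		if (!(page ? emit(page, PAGE_SIZE) : skip(PAGE_SIZE)))
			return false;	/* write error or interrupted */
	}
	return true;
}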
+ */ +static unsigned long vma_dump_size(struct vm_area_struct *vma, + unsigned long mm_flags) +{ +#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) + + /* always dump the vdso and vsyscall sections */ + if (always_dump_vma(vma)) + goto whole; + + if (vma->vm_flags & VM_DONTDUMP) + return 0; + + /* support for DAX */ + if (vma_is_dax(vma)) { + if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED)) + goto whole; + if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE)) + goto whole; + return 0; + } + + /* Hugetlb memory check */ + if (is_vm_hugetlb_page(vma)) { + if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) + goto whole; + if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) + goto whole; + return 0; + } + + /* Do not dump I/O mapped devices or special mappings */ + if (vma->vm_flags & VM_IO) + return 0; + + /* By default, dump shared memory if mapped from an anonymous file. */ + if (vma->vm_flags & VM_SHARED) { + if (file_inode(vma->vm_file)->i_nlink == 0 ? + FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED)) + goto whole; + return 0; + } + + /* Dump segments that have been written to. */ + if ((!IS_ENABLED(CONFIG_MMU) || vma->anon_vma) && FILTER(ANON_PRIVATE)) + goto whole; + if (vma->vm_file == NULL) + return 0; + + if (FILTER(MAPPED_PRIVATE)) + goto whole; + + /* + * If this is the beginning of an executable file mapping, + * dump the first page to aid in determining what was mapped here. + */ + if (FILTER(ELF_HEADERS) && + vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) && + (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0) + return PAGE_SIZE; + +#undef FILTER + + return 0; + +whole: + return vma->vm_end - vma->vm_start; +} + +static struct vm_area_struct *first_vma(struct task_struct *tsk, + struct vm_area_struct *gate_vma) +{ + struct vm_area_struct *ret = tsk->mm->mmap; + + if (ret) + return ret; + return gate_vma; +} + +/* + * Helper function for iterating across a vma list. It ensures that the caller + * will visit `gate_vma' prior to terminating the search. + */ +static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, + struct vm_area_struct *gate_vma) +{ + struct vm_area_struct *ret; + + ret = this_vma->vm_next; + if (ret) + return ret; + if (this_vma == gate_vma) + return NULL; + return gate_vma; +} + +/* + * Under the mmap_lock, take a snapshot of relevant information about the task's + * VMAs. + */ +int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, + struct core_vma_metadata **vma_meta, + size_t *vma_data_size_ptr) +{ + struct vm_area_struct *vma, *gate_vma; + struct mm_struct *mm = current->mm; + int i; + size_t vma_data_size = 0; + + /* + * Once the stack expansion code is fixed to not change VMA bounds + * under mmap_lock in read mode, this can be changed to take the + * mmap_lock in read mode. + */ + if (mmap_write_lock_killable(mm)) + return -EINTR; + + gate_vma = get_gate_vma(mm); + *vma_count = mm->map_count + (gate_vma ? 
1 : 0); + + *vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL); + if (!*vma_meta) { + mmap_write_unlock(mm); + return -ENOMEM; + } + + for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma), i++) { + struct core_vma_metadata *m = (*vma_meta) + i; + + m->start = vma->vm_start; + m->end = vma->vm_end; + m->flags = vma->vm_flags; + m->dump_size = vma_dump_size(vma, cprm->mm_flags); + + vma_data_size += m->dump_size; + } + + mmap_write_unlock(mm); + + if (WARN_ON(i != *vma_count)) + return -EFAULT; + + *vma_data_size_ptr = vma_data_size; + return 0; +} diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 9212325763b0..4ef3f714046a 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -343,9 +343,11 @@ void fscrypt_msg(const struct inode *inode, const char *level, va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - if (inode) + if (inode && inode->i_ino) printk("%sfscrypt (%s, inode %lu): %pV\n", level, inode->i_sb->s_id, inode->i_ino, &vaf); + else if (inode) + printk("%sfscrypt (%s): %pV\n", level, inode->i_sb->s_id, &vaf); else printk("%sfscrypt: %pV\n", level, &vaf); va_end(args); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 011830f84d8d..1fbe6c24d705 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -61,15 +61,6 @@ struct fscrypt_nokey_name { */ #define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256) -static void fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result) -{ - struct sha256_state sctx; - - sha256_init(&sctx); - sha256_update(&sctx, data, data_len); - sha256_final(&sctx, result); -} - static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && str->name[0] == '.') @@ -242,11 +233,11 @@ static int base64_decode(const char *src, int len, u8 *dst) return cp - dst; } -bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, - u32 max_len, u32 *encrypted_len_ret) +bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret) { - const struct fscrypt_info *ci = inode->i_crypt_info; - int padding = 4 << (fscrypt_policy_flags(&ci->ci_policy) & + int padding = 4 << (fscrypt_policy_flags(policy) & FSCRYPT_POLICY_FLAGS_PAD_MASK); u32 encrypted_len; @@ -260,8 +251,6 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, /** * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames - * @inode: inode of the parent directory (for regular filenames) - * or of the symlink (for symlink targets) * @max_encrypted_len: maximum length of encrypted filenames the buffer will be * used to present * @crypto_str: (output) buffer to allocate @@ -271,8 +260,7 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, * * Return: 0 on success, -errno on failure */ -int fscrypt_fname_alloc_buffer(const struct inode *inode, - u32 max_encrypted_len, +int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str) { const u32 max_encoded_len = BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX); @@ -369,9 +357,9 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, } else { memcpy(nokey_name.bytes, iname->name, sizeof(nokey_name.bytes)); /* Compute strong hash of remaining part of name. 
*/ - fscrypt_do_sha256(&iname->name[sizeof(nokey_name.bytes)], - iname->len - sizeof(nokey_name.bytes), - nokey_name.sha256); + sha256(&iname->name[sizeof(nokey_name.bytes)], + iname->len - sizeof(nokey_name.bytes), + nokey_name.sha256); size = FSCRYPT_NOKEY_NAME_MAX; } oname->len = base64_encode((const u8 *)&nokey_name, size, oname->name); @@ -394,9 +382,9 @@ EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); * directory's encryption key, then @iname is the plaintext, so we encrypt it to * get the disk_name. * - * Else, for keyless @lookup operations, @iname is the presented ciphertext, so - * we decode it to get the fscrypt_nokey_name. Non-@lookup operations will be - * impossible in this case, so we fail them with ENOKEY. + * Else, for keyless @lookup operations, @iname should be a no-key name, so we + * decode it to get the struct fscrypt_nokey_name. Non-@lookup operations will + * be impossible in this case, so we fail them with ENOKEY. * * If successful, fscrypt_free_filename() must be called later to clean up. * @@ -421,7 +409,8 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return ret; if (fscrypt_has_encryption_key(dir)) { - if (!fscrypt_fname_encrypted_size(dir, iname->len, + if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy, + iname->len, dir->i_sb->s_cop->max_namelen, &fname->crypto_buf.len)) return -ENAMETOOLONG; @@ -440,7 +429,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, } if (!lookup) return -ENOKEY; - fname->is_ciphertext_name = true; + fname->is_nokey_name = true; /* * We don't have the key and we are doing a lookup; decode the @@ -499,7 +488,7 @@ bool fscrypt_match_name(const struct fscrypt_name *fname, { const struct fscrypt_nokey_name *nokey_name = (const void *)fname->crypto_buf.name; - u8 sha256[SHA256_DIGEST_SIZE]; + u8 digest[SHA256_DIGEST_SIZE]; if (likely(fname->disk_name.name)) { if (de_name_len != fname->disk_name.len) @@ -510,9 +499,9 @@ bool fscrypt_match_name(const struct fscrypt_name *fname, return false; if (memcmp(de_name, nokey_name->bytes, sizeof(nokey_name->bytes))) return false; - fscrypt_do_sha256(&de_name[sizeof(nokey_name->bytes)], - de_name_len - sizeof(nokey_name->bytes), sha256); - return !memcmp(sha256, nokey_name->sha256, sizeof(sha256)); + sha256(&de_name[sizeof(nokey_name->bytes)], + de_name_len - sizeof(nokey_name->bytes), digest); + return !memcmp(digest, nokey_name->sha256, sizeof(digest)); } EXPORT_SYMBOL_GPL(fscrypt_match_name); @@ -541,7 +530,7 @@ EXPORT_SYMBOL_GPL(fscrypt_fname_siphash); * Validate dentries in encrypted directories to make sure we aren't potentially * caching stale dentries after a key has been added. */ -static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) +int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *dir; int err; @@ -549,17 +538,17 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) /* * Plaintext names are always valid, since fscrypt doesn't support - * reverting to ciphertext names without evicting the directory's inode + * reverting to no-key names without evicting the directory's inode * -- which implies eviction of the dentries in the directory. */ - if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME)) + if (!(dentry->d_flags & DCACHE_NOKEY_NAME)) return 1; /* - * Ciphertext name; valid if the directory's key is still unavailable. + * No-key name; valid if the directory's key is still unavailable. 
* - * Although fscrypt forbids rename() on ciphertext names, we still must - * use dget_parent() here rather than use ->d_parent directly. That's + * Although fscrypt forbids rename() on no-key names, we still must use + * dget_parent() here rather than use ->d_parent directly. That's * because a corrupted fs image may contain directory hard links, which * the VFS handles by moving the directory's dentry tree in the dcache * each time ->lookup() finds the directory and it already has a dentry @@ -580,6 +569,7 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return valid; } +EXPORT_SYMBOL_GPL(fscrypt_d_revalidate); const struct dentry_operations fscrypt_d_ops = { .d_revalidate = fscrypt_d_revalidate, diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 8117a61b6f55..4f5806a3b73d 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -97,7 +97,6 @@ static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx) return NULL; } -#undef fscrypt_policy union fscrypt_policy { u8 version; struct fscrypt_policy_v1 v1; @@ -292,8 +291,9 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, /* fname.c */ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen); -bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, - u32 max_len, u32 *encrypted_len_ret); +bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret); extern const struct dentry_operations fscrypt_d_ops; /* hkdf.c */ @@ -572,6 +572,9 @@ int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key); int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, const struct fscrypt_master_key *mk); +void fscrypt_hash_inode_number(struct fscrypt_info *ci, + const struct fscrypt_master_key *mk); + /* keysetup_v1.c */ void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); @@ -590,5 +593,6 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, int fscrypt_policy_from_context(union fscrypt_policy *policy_u, const union fscrypt_context *ctx_u, int ctx_size); +const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir); #endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 09fb8aa0f2e9..20b0df47fe6a 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -60,8 +60,8 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, if (err) return err; - /* ... in case we looked up ciphertext name before key was added */ - if (dentry->d_flags & DCACHE_ENCRYPTED_NAME) + /* ... in case we looked up no-key name before key was added */ + if (dentry->d_flags & DCACHE_NOKEY_NAME) return -ENOKEY; if (!fscrypt_has_permitted_context(dir, inode)) @@ -85,9 +85,8 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) return err; - /* ... in case we looked up ciphertext name(s) before key was added */ - if ((old_dentry->d_flags | new_dentry->d_flags) & - DCACHE_ENCRYPTED_NAME) + /* ... 
in case we looked up no-key name(s) before key was added */ + if ((old_dentry->d_flags | new_dentry->d_flags) & DCACHE_NOKEY_NAME) return -ENOKEY; if (old_dir != new_dir) { @@ -114,9 +113,9 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, if (err && err != -ENOENT) return err; - if (fname->is_ciphertext_name) { + if (fname->is_nokey_name) { spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_ENCRYPTED_NAME; + dentry->d_flags |= DCACHE_NOKEY_NAME; spin_unlock(&dentry->d_lock); d_set_d_op(dentry, &fscrypt_d_ops); } @@ -166,26 +165,51 @@ int fscrypt_prepare_setflags(struct inode *inode, return 0; } -int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, - unsigned int max_len, - struct fscrypt_str *disk_link) +/** + * fscrypt_prepare_symlink() - prepare to create a possibly-encrypted symlink + * @dir: directory in which the symlink is being created + * @target: plaintext symlink target + * @len: length of @target excluding null terminator + * @max_len: space the filesystem has available to store the symlink target + * @disk_link: (out) the on-disk symlink target being prepared + * + * This function computes the size the symlink target will require on-disk, + * stores it in @disk_link->len, and validates it against @max_len. An + * encrypted symlink may be longer than the original. + * + * Additionally, @disk_link->name is set to @target if the symlink will be + * unencrypted, but left NULL if the symlink will be encrypted. For encrypted + * symlinks, the filesystem must call fscrypt_encrypt_symlink() to create the + * on-disk target later. (The reason for the two-step process is that some + * filesystems need to know the size of the symlink target before creating the + * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.) + * + * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long, + * -ENOKEY if the encryption key is missing, or another -errno code if a problem + * occurred while setting up the encryption key. + */ +int fscrypt_prepare_symlink(struct inode *dir, const char *target, + unsigned int len, unsigned int max_len, + struct fscrypt_str *disk_link) { - int err; + const union fscrypt_policy *policy; /* * To calculate the size of the encrypted symlink target we need to know * the amount of NUL padding, which is determined by the flags set in * the encryption policy which will be inherited from the directory. - * The easiest way to get access to this is to just load the directory's - * fscrypt_info, since we'll need it to create the dir_entry anyway. - * - * Note: in test_dummy_encryption mode, @dir may be unencrypted. */ - err = fscrypt_get_encryption_info(dir); - if (err) - return err; - if (!fscrypt_has_encryption_key(dir)) - return -ENOKEY; + policy = fscrypt_policy_to_inherit(dir); + if (policy == NULL) { + /* Not encrypted */ + disk_link->name = (unsigned char *)target; + disk_link->len = len + 1; + if (disk_link->len > max_len) + return -ENAMETOOLONG; + return 0; + } + if (IS_ERR(policy)) + return PTR_ERR(policy); /* * Calculate the size of the encrypted symlink and verify it won't @@ -198,7 +222,7 @@ int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, * counting it (even though it is meaningless for ciphertext) is simpler * for now since filesystems will assume it is there and subtract it. 
*/ - if (!fscrypt_fname_encrypted_size(dir, len, + if (!fscrypt_fname_encrypted_size(policy, len, max_len - sizeof(struct fscrypt_symlink_data), &disk_link->len)) return -ENAMETOOLONG; @@ -207,7 +231,7 @@ int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, disk_link->name = NULL; return 0; } -EXPORT_SYMBOL_GPL(__fscrypt_prepare_symlink); +EXPORT_SYMBOL_GPL(fscrypt_prepare_symlink); int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) @@ -217,9 +241,13 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, struct fscrypt_symlink_data *sd; unsigned int ciphertext_len; - err = fscrypt_require_key(inode); - if (err) - return err; + /* + * fscrypt_prepare_new_inode() should have already set up the new + * symlink inode's encryption key. We don't wait until now to do it, + * since we may be in a filesystem transaction now. + */ + if (WARN_ON_ONCE(!fscrypt_has_encryption_key(inode))) + return -ENOKEY; if (disk_link->name) { /* filesystem-provided buffer */ @@ -319,7 +347,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, if (cstr.len + sizeof(*sd) - 1 > max_size) return ERR_PTR(-EUCLEAN); - err = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); + err = fscrypt_fname_alloc_buffer(cstr.len, &pstr); if (err) return ERR_PTR(err); diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index faa25541ccb6..89bffa82ed74 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -106,7 +106,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) crypto_cfg.data_unit_size = sb->s_blocksize; crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci); num_devs = fscrypt_get_num_devices(sb); - devs = kmalloc_array(num_devs, sizeof(*devs), GFP_NOFS); + devs = kmalloc_array(num_devs, sizeof(*devs), GFP_KERNEL); if (!devs) return -ENOMEM; fscrypt_get_devices(sb, num_devs, devs); @@ -135,9 +135,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, struct fscrypt_blk_crypto_key *blk_key; int err; int i; - unsigned int flags; - blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_NOFS); + blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_KERNEL); if (!blk_key) return -ENOMEM; @@ -166,10 +165,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, } queue_refs++; - flags = memalloc_nofs_save(); err = blk_crypto_start_using_key(&blk_key->base, blk_key->devs[i]); - memalloc_nofs_restore(flags); if (err) { fscrypt_err(inode, "error %d starting to use blk-crypto", err); diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index e74f239c4428..53cc552a7b8f 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -817,6 +817,7 @@ static int check_for_busy_inodes(struct super_block *sb, struct list_head *pos; size_t busy_count = 0; unsigned long ino; + char ino_str[50] = ""; spin_lock(&mk->mk_decrypted_inodes_lock); @@ -838,11 +839,15 @@ static int check_for_busy_inodes(struct super_block *sb, } spin_unlock(&mk->mk_decrypted_inodes_lock); + /* If the inode is currently being created, ino may still be 0. 
*/ + if (ino) + snprintf(ino_str, sizeof(ino_str), ", including ino %lu", ino); + fscrypt_warn(NULL, - "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu", + "%s: %zu inode(s) still busy after removing key with %s %*phN%s", sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec), master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u, - ino); + ino_str); return -EBUSY; } diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index fea6226afc2b..d3c3e5d9b41f 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -10,6 +10,7 @@ #include <crypto/skcipher.h> #include <linux/key.h> +#include <linux/random.h> #include "fscrypt_private.h" @@ -222,6 +223,16 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, return 0; } +void fscrypt_hash_inode_number(struct fscrypt_info *ci, + const struct fscrypt_master_key *mk) +{ + WARN_ON(ci->ci_inode->i_ino == 0); + WARN_ON(!mk->mk_ino_hash_key_initialized); + + ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino, + &mk->mk_ino_hash_key); +} + static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci, struct fscrypt_master_key *mk) { @@ -254,13 +265,20 @@ unlock: return err; } - ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino, - &mk->mk_ino_hash_key); + /* + * New inodes may not have an inode number assigned yet. + * Hashing their inode number is delayed until later. + */ + if (ci->ci_inode->i_ino == 0) + WARN_ON(!(ci->ci_inode->i_state & I_CREATING)); + else + fscrypt_hash_inode_number(ci, mk); return 0; } static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, - struct fscrypt_master_key *mk) + struct fscrypt_master_key *mk, + bool need_dirhash_key) { int err; @@ -306,7 +324,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, return err; /* Derive a secret dirhash key for directories that need it. */ - if (S_ISDIR(ci->ci_inode->i_mode) && IS_CASEFOLDED(ci->ci_inode)) { + if (need_dirhash_key) { err = fscrypt_derive_dirhash_key(ci, mk); if (err) return err; @@ -326,6 +344,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, * key being removed with a new inode starting to use it. 
*/ static int setup_file_encryption_key(struct fscrypt_info *ci, + bool need_dirhash_key, struct key **master_key_ret) { struct key *key; @@ -400,7 +419,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw); break; case FSCRYPT_POLICY_V2: - err = fscrypt_setup_v2_file_key(ci, mk); + err = fscrypt_setup_v2_file_key(ci, mk, need_dirhash_key); break; default: WARN_ON(1); @@ -454,57 +473,28 @@ static void put_crypt_info(struct fscrypt_info *ci) kmem_cache_free(fscrypt_info_cachep, ci); } -int fscrypt_get_encryption_info(struct inode *inode) +static int +fscrypt_setup_encryption_info(struct inode *inode, + const union fscrypt_policy *policy, + const u8 nonce[FSCRYPT_FILE_NONCE_SIZE], + bool need_dirhash_key) { struct fscrypt_info *crypt_info; - union fscrypt_context ctx; struct fscrypt_mode *mode; struct key *master_key = NULL; int res; - if (fscrypt_has_encryption_key(inode)) - return 0; - res = fscrypt_initialize(inode->i_sb->s_cop->flags); if (res) return res; - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); - if (res < 0) { - const union fscrypt_context *dummy_ctx = - fscrypt_get_dummy_context(inode->i_sb); - - if (IS_ENCRYPTED(inode) || !dummy_ctx) { - fscrypt_warn(inode, - "Error %d getting encryption context", - res); - return res; - } - /* Fake up a context for an unencrypted directory */ - res = fscrypt_context_size(dummy_ctx); - memcpy(&ctx, dummy_ctx, res); - } - - crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS); + crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_KERNEL); if (!crypt_info) return -ENOMEM; crypt_info->ci_inode = inode; - - res = fscrypt_policy_from_context(&crypt_info->ci_policy, &ctx, res); - if (res) { - fscrypt_warn(inode, - "Unrecognized or corrupt encryption context"); - goto out; - } - - memcpy(crypt_info->ci_nonce, fscrypt_context_nonce(&ctx), - FSCRYPT_FILE_NONCE_SIZE); - - if (!fscrypt_supported_policy(&crypt_info->ci_policy, inode)) { - res = -EINVAL; - goto out; - } + crypt_info->ci_policy = *policy; + memcpy(crypt_info->ci_nonce, nonce, FSCRYPT_FILE_NONCE_SIZE); mode = select_encryption_mode(&crypt_info->ci_policy, inode); if (IS_ERR(mode)) { @@ -514,13 +504,14 @@ int fscrypt_get_encryption_info(struct inode *inode) WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE); crypt_info->ci_mode = mode; - res = setup_file_encryption_key(crypt_info, &master_key); + res = setup_file_encryption_key(crypt_info, need_dirhash_key, + &master_key); if (res) goto out; /* - * Multiple tasks may race to set ->i_crypt_info, so use - * cmpxchg_release(). This pairs with the smp_load_acquire() in + * For existing inodes, multiple tasks may race to set ->i_crypt_info. + * So use cmpxchg_release(). This pairs with the smp_load_acquire() in * fscrypt_get_info(). I.e., here we publish ->i_crypt_info with a * RELEASE barrier so that other tasks can ACQUIRE it. */ @@ -550,14 +541,113 @@ out: up_read(&mk->mk_secret_sem); key_put(master_key); } + put_crypt_info(crypt_info); + return res; +} + +/** + * fscrypt_get_encryption_info() - set up an inode's encryption key + * @inode: the inode to set up the key for. Must be encrypted. + * + * Set up ->i_crypt_info, if it hasn't already been done. + * + * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So + * generally this shouldn't be called from within a filesystem transaction. + * + * Return: 0 if ->i_crypt_info was set or was already set, *or* if the + * encryption key is unavailable. 
(Use fscrypt_has_encryption_key() to + * distinguish these cases.) Also can return another -errno code. + */ +int fscrypt_get_encryption_info(struct inode *inode) +{ + int res; + union fscrypt_context ctx; + union fscrypt_policy policy; + + if (fscrypt_has_encryption_key(inode)) + return 0; + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0) { + fscrypt_warn(inode, "Error %d getting encryption context", res); + return res; + } + + res = fscrypt_policy_from_context(&policy, &ctx, res); + if (res) { + fscrypt_warn(inode, + "Unrecognized or corrupt encryption context"); + return res; + } + + if (!fscrypt_supported_policy(&policy, inode)) + return -EINVAL; + + res = fscrypt_setup_encryption_info(inode, &policy, + fscrypt_context_nonce(&ctx), + IS_CASEFOLDED(inode) && + S_ISDIR(inode->i_mode)); if (res == -ENOKEY) res = 0; - put_crypt_info(crypt_info); return res; } EXPORT_SYMBOL(fscrypt_get_encryption_info); /** + * fscrypt_prepare_new_inode() - prepare to create a new inode in a directory + * @dir: a possibly-encrypted directory + * @inode: the new inode. ->i_mode must be set already. + * ->i_ino doesn't need to be set yet. + * @encrypt_ret: (output) set to %true if the new inode will be encrypted + * + * If the directory is encrypted, set up its ->i_crypt_info in preparation for + * encrypting the name of the new file. Also, if the new inode will be + * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true. + * + * This isn't %GFP_NOFS-safe, and therefore it should be called before starting + * any filesystem transaction to create the inode. For this reason, ->i_ino + * isn't required to be set yet, as the filesystem may not have set it yet. + * + * This doesn't persist the new inode's encryption context. That still needs to + * be done later by calling fscrypt_set_context(). + * + * Return: 0 on success, -ENOKEY if the encryption key is missing, or another + * -errno code + */ +int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, + bool *encrypt_ret) +{ + const union fscrypt_policy *policy; + u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; + + policy = fscrypt_policy_to_inherit(dir); + if (policy == NULL) + return 0; + if (IS_ERR(policy)) + return PTR_ERR(policy); + + if (WARN_ON_ONCE(inode->i_mode == 0)) + return -EINVAL; + + /* + * Only regular files, directories, and symlinks are encrypted. + * Special files like device nodes and named pipes aren't. 
+ */ + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + return 0; + + *encrypt_ret = true; + + get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE); + return fscrypt_setup_encryption_info(inode, policy, nonce, + IS_CASEFOLDED(dir) && + S_ISDIR(inode->i_mode)); +} +EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode); + +/** * fscrypt_put_encryption_info() - free most of an inode's fscrypt data * @inode: an inode being evicted * diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c index a3cb52572b05..2762c5350432 100644 --- a/fs/crypto/keysetup_v1.c +++ b/fs/crypto/keysetup_v1.c @@ -60,7 +60,7 @@ static int derive_key_aes(const u8 *master_key, goto out; } crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); - req = skcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_KERNEL); if (!req) { res = -ENOMEM; goto out; @@ -99,7 +99,7 @@ find_and_lock_process_key(const char *prefix, const struct user_key_payload *ukp; const struct fscrypt_key *payload; - description = kasprintf(GFP_NOFS, "%s%*phN", prefix, + description = kasprintf(GFP_KERNEL, "%s%*phN", prefix, FSCRYPT_KEY_DESCRIPTOR_SIZE, descriptor); if (!description) return ERR_PTR(-ENOMEM); @@ -228,7 +228,7 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key) return dk; /* Nope, allocate one. */ - dk = kzalloc(sizeof(*dk), GFP_NOFS); + dk = kzalloc(sizeof(*dk), GFP_KERNEL); if (!dk) return ERR_PTR(-ENOMEM); refcount_set(&dk->dk_refcount, 1); @@ -272,7 +272,7 @@ static int setup_v1_file_key_derived(struct fscrypt_info *ci, * This cannot be a stack buffer because it will be passed to the * scatterlist crypto API during derive_key_aes(). */ - derived_key = kmalloc(ci->ci_mode->keysize, GFP_NOFS); + derived_key = kmalloc(ci->ci_mode->keysize, GFP_KERNEL); if (!derived_key) return -ENOMEM; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 2d73fd39ad96..4441d9944b9e 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -32,6 +32,14 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, return !memcmp(policy1, policy2, fscrypt_policy_size(policy1)); } +static const union fscrypt_policy * +fscrypt_get_dummy_policy(struct super_block *sb) +{ + if (!sb->s_cop->get_dummy_policy) + return NULL; + return sb->s_cop->get_dummy_policy(sb); +} + static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) { if (contents_mode == FSCRYPT_MODE_AES_256_XTS && @@ -192,10 +200,15 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, 32, 32)) return false; + /* + * IV_INO_LBLK_32 hashes the inode number, so in principle it can + * support any ino_bits. However, currently the inode number is gotten + * from inode::i_ino which is 'unsigned long'. So for now the + * implementation limit is 32 bits. + */ if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) && - /* This uses hashed inode numbers, so ino_bits doesn't matter. 
*/ !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32", - INT_MAX, 32)) + 32, 32)) return false; if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) { @@ -231,18 +244,19 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, } /** - * fscrypt_new_context_from_policy() - create a new fscrypt_context from - * an fscrypt_policy + * fscrypt_new_context() - create a new fscrypt_context * @ctx_u: output context * @policy_u: input policy + * @nonce: nonce to use * * Create an fscrypt_context for an inode that is being assigned the given - * encryption policy. A new nonce is randomly generated. + * encryption policy. @nonce must be a new random nonce. * * Return: the size of the new context in bytes. */ -static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u, - const union fscrypt_policy *policy_u) +static int fscrypt_new_context(union fscrypt_context *ctx_u, + const union fscrypt_policy *policy_u, + const u8 nonce[FSCRYPT_FILE_NONCE_SIZE]) { memset(ctx_u, 0, sizeof(*ctx_u)); @@ -260,7 +274,7 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u, memcpy(ctx->master_key_descriptor, policy->master_key_descriptor, sizeof(ctx->master_key_descriptor)); - get_random_bytes(ctx->nonce, sizeof(ctx->nonce)); + memcpy(ctx->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE); return sizeof(*ctx); } case FSCRYPT_POLICY_V2: { @@ -276,7 +290,7 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u, memcpy(ctx->master_key_identifier, policy->master_key_identifier, sizeof(ctx->master_key_identifier)); - get_random_bytes(ctx->nonce, sizeof(ctx->nonce)); + memcpy(ctx->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE); return sizeof(*ctx); } } @@ -372,6 +386,7 @@ static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy) static int set_encryption_policy(struct inode *inode, const union fscrypt_policy *policy) { + u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; union fscrypt_context ctx; int ctxsize; int err; @@ -409,7 +424,8 @@ static int set_encryption_policy(struct inode *inode, return -EINVAL; } - ctxsize = fscrypt_new_context_from_policy(&ctx, policy); + get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE); + ctxsize = fscrypt_new_context(&ctx, policy, nonce); return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, NULL); } @@ -620,86 +636,99 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) } EXPORT_SYMBOL(fscrypt_has_permitted_context); +/* + * Return the encryption policy that new files in the directory will inherit, or + * NULL if none, or an ERR_PTR() on error. If the directory is encrypted, also + * ensure that its key is set up, so that the new filename can be encrypted. + */ +const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) +{ + int err; + + if (IS_ENCRYPTED(dir)) { + err = fscrypt_require_key(dir); + if (err) + return ERR_PTR(err); + return &dir->i_crypt_info->ci_policy; + } + + return fscrypt_get_dummy_policy(dir->i_sb); +} + /** - * fscrypt_inherit_context() - Sets a child context from its parent - * @parent: Parent inode from which the context is inherited. - * @child: Child inode that inherits the context from @parent. - * @fs_data: private data given by FS. 
- * @preload: preload child i_crypt_info if true + * fscrypt_set_context() - Set the fscrypt context of a new inode + * @inode: a new inode + * @fs_data: private data given by FS and passed to ->set_context() + * + * This should be called after fscrypt_prepare_new_inode(), generally during a + * filesystem transaction. Everything here must be %GFP_NOFS-safe. * * Return: 0 on success, -errno on failure */ -int fscrypt_inherit_context(struct inode *parent, struct inode *child, - void *fs_data, bool preload) +int fscrypt_set_context(struct inode *inode, void *fs_data) { + struct fscrypt_info *ci = inode->i_crypt_info; union fscrypt_context ctx; int ctxsize; - struct fscrypt_info *ci; - int res; - - res = fscrypt_get_encryption_info(parent); - if (res < 0) - return res; - ci = fscrypt_get_info(parent); - if (ci == NULL) + /* fscrypt_prepare_new_inode() should have set up the key already. */ + if (WARN_ON_ONCE(!ci)) return -ENOKEY; - ctxsize = fscrypt_new_context_from_policy(&ctx, &ci->ci_policy); - BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE); - res = parent->i_sb->s_cop->set_context(child, &ctx, ctxsize, fs_data); - if (res) - return res; - return preload ? fscrypt_get_encryption_info(child): 0; + ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce); + + /* + * This may be the first time the inode number is available, so do any + * delayed key setup that requires the inode number. + */ + if (ci->ci_policy.version == FSCRYPT_POLICY_V2 && + (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) { + const struct fscrypt_master_key *mk = + ci->ci_master_key->payload.data[0]; + + fscrypt_hash_inode_number(ci, mk); + } + + return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, fs_data); } -EXPORT_SYMBOL(fscrypt_inherit_context); +EXPORT_SYMBOL_GPL(fscrypt_set_context); /** * fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption' * @sb: the filesystem on which test_dummy_encryption is being specified - * @arg: the argument to the test_dummy_encryption option. - * If no argument was specified, then @arg->from == NULL. - * @dummy_ctx: the filesystem's current dummy context (input/output, see below) + * @arg: the argument to the test_dummy_encryption option. May be NULL. + * @dummy_policy: the filesystem's current dummy policy (input/output, see + * below) * * Handle the test_dummy_encryption mount option by creating a dummy encryption - * context, saving it in @dummy_ctx, and adding the corresponding dummy - * encryption key to the filesystem. If the @dummy_ctx is already set, then + * policy, saving it in @dummy_policy, and adding the corresponding dummy + * encryption key to the filesystem. If the @dummy_policy is already set, then * instead validate that it matches @arg. Don't support changing it via * remount, as that is difficult to do safely. * - * The reason we use an fscrypt_context rather than an fscrypt_policy is because - * we mustn't generate a new nonce each time we access a dummy-encrypted - * directory, as that would change the way filenames are encrypted. - * - * Return: 0 on success (dummy context set, or the same context is already set); - * -EEXIST if a different dummy context is already set; + * Return: 0 on success (dummy policy set, or the same policy is already set); + * -EEXIST if a different dummy policy is already set; * or another -errno value. 
*/ -int fscrypt_set_test_dummy_encryption(struct super_block *sb, - const substring_t *arg, - struct fscrypt_dummy_context *dummy_ctx) +int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg, + struct fscrypt_dummy_policy *dummy_policy) { - const char *argstr = "v2"; - const char *argstr_to_free = NULL; struct fscrypt_key_specifier key_spec = { 0 }; int version; - union fscrypt_context *ctx = NULL; + union fscrypt_policy *policy = NULL; int err; - if (arg->from) { - argstr = argstr_to_free = match_strdup(arg); - if (!argstr) - return -ENOMEM; - } + if (!arg) + arg = "v2"; - if (!strcmp(argstr, "v1")) { - version = FSCRYPT_CONTEXT_V1; + if (!strcmp(arg, "v1")) { + version = FSCRYPT_POLICY_V1; key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; memset(key_spec.u.descriptor, 0x42, FSCRYPT_KEY_DESCRIPTOR_SIZE); - } else if (!strcmp(argstr, "v2")) { - version = FSCRYPT_CONTEXT_V2; + } else if (!strcmp(arg, "v2")) { + version = FSCRYPT_POLICY_V2; key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; /* key_spec.u.identifier gets filled in when adding the key */ } else { @@ -707,21 +736,8 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb, goto out; } - if (dummy_ctx->ctx) { - /* - * Note: if we ever make test_dummy_encryption support - * specifying other encryption settings, such as the encryption - * modes, we'll need to compare those settings here. - */ - if (dummy_ctx->ctx->version == version) - err = 0; - else - err = -EEXIST; - goto out; - } - - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) { + policy = kzalloc(sizeof(*policy), GFP_KERNEL); + if (!policy) { err = -ENOMEM; goto out; } @@ -730,18 +746,18 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb, if (err) goto out; - ctx->version = version; - switch (ctx->version) { - case FSCRYPT_CONTEXT_V1: - ctx->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; - ctx->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - memcpy(ctx->v1.master_key_descriptor, key_spec.u.descriptor, + policy->version = version; + switch (policy->version) { + case FSCRYPT_POLICY_V1: + policy->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; + policy->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; + memcpy(policy->v1.master_key_descriptor, key_spec.u.descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE); break; - case FSCRYPT_CONTEXT_V2: - ctx->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; - ctx->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - memcpy(ctx->v2.master_key_identifier, key_spec.u.identifier, + case FSCRYPT_POLICY_V2: + policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; + policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; + memcpy(policy->v2.master_key_identifier, key_spec.u.identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); break; default: @@ -749,12 +765,19 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb, err = -EINVAL; goto out; } - dummy_ctx->ctx = ctx; - ctx = NULL; + + if (dummy_policy->policy) { + if (fscrypt_policies_equal(policy, dummy_policy->policy)) + err = 0; + else + err = -EEXIST; + goto out; + } + dummy_policy->policy = policy; + policy = NULL; err = 0; out: - kfree(ctx); - kfree(argstr_to_free); + kfree(policy); return err; } EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption); @@ -771,10 +794,16 @@ EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption); void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb) { - const union fscrypt_context *ctx = fscrypt_get_dummy_context(sb); + 
const union fscrypt_policy *policy = fscrypt_get_dummy_policy(sb); + int vers; - if (!ctx) + if (!policy) return; - seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, ctx->version); + + vers = policy->version; + if (vers == FSCRYPT_POLICY_V1) /* Handle numbering quirk */ + vers = 1; + + seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, vers); } EXPORT_SYMBOL_GPL(fscrypt_show_test_dummy_encryption); diff --git a/fs/d_path.c b/fs/d_path.c index 0f1fc1743302..a69e2cd36e6e 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -102,6 +102,8 @@ restart: if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { struct mount *parent = READ_ONCE(mnt->mnt_parent); + struct mnt_namespace *mnt_ns; + /* Escaped? */ if (dentry != vfsmnt->mnt_root) { bptr = *buffer; @@ -116,7 +118,9 @@ restart: vfsmnt = &mnt->mnt; continue; } - if (is_mounted(vfsmnt) && !is_anon_ns(mnt->mnt_ns)) + mnt_ns = READ_ONCE(mnt->mnt_ns); + /* open-coded is_mounted() to use local mnt_ns */ + if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns)) error = 1; // absolute root else error = 2; // detached or not attached yet @@ -1037,18 +1037,18 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, return ret; } -int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size, - struct iomap *iomap) +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap) { sector_t sector = iomap_sector(iomap, pos & PAGE_MASK); pgoff_t pgoff; long rc, id; void *kaddr; bool page_aligned = false; - + unsigned offset = offset_in_page(pos); + unsigned size = min_t(u64, PAGE_SIZE - offset, length); if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) && - IS_ALIGNED(size, PAGE_SIZE)) + (size == PAGE_SIZE)) page_aligned = true; rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff); @@ -1058,8 +1058,7 @@ int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size, id = dax_read_lock(); if (page_aligned) - rc = dax_zero_page_range(iomap->dax_dev, pgoff, - size >> PAGE_SHIFT); + rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); else rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL); if (rc < 0) { @@ -1072,7 +1071,7 @@ int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size, dax_flush(iomap->dax_dev, kaddr + offset, size); } dax_read_unlock(id); - return 0; + return size; } static loff_t diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index b167d2d02148..a768a09430c3 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -177,7 +177,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp) goto out; if (!fops_get(real_fops)) { -#ifdef MODULE +#ifdef CONFIG_MODULES if (real_fops->owner && real_fops->owner->state == MODULE_STATE_GOING) goto out; @@ -312,7 +312,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp) goto out; if (!fops_get(real_fops)) { -#ifdef MODULE +#ifdef CONFIG_MODULES if (real_fops->owner && real_fops->owner->state == MODULE_STATE_GOING) goto out; diff --git a/fs/direct-io.c b/fs/direct-io.c index 183299892465..d53fa92a1ab6 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -386,25 +386,6 @@ static void dio_bio_end_io(struct bio *bio) spin_unlock_irqrestore(&dio->bio_lock, flags); } -/** - * dio_end_io - handle the end io action for the given bio - * @bio: The direct io bio thats being completed - * - * This is meant to be called by any filesystem that uses their own dio_submit_t - * so that the DIO specific endio actions are dealt with after the filesystem - * has done it's completion work. 
- */ -void dio_end_io(struct bio *bio) -{ - struct dio *dio = bio->bi_private; - - if (dio->is_async) - dio_bio_end_aio(bio); - else - dio_bio_end_io(bio); -} -EXPORT_SYMBOL_GPL(dio_end_io); - static inline void dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, struct block_device *bdev, @@ -1165,22 +1146,13 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, * the early prefetch in the caller enough time. */ - if (align & blocksize_mask) { - if (bdev) - blkbits = blksize_bits(bdev_logical_block_size(bdev)); - blocksize_mask = (1 << blkbits) - 1; - if (align & blocksize_mask) - goto out; - } - /* watch out for a 0 len io from a tricksy fs */ if (iov_iter_rw(iter) == READ && !count) return 0; dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); - retval = -ENOMEM; if (!dio) - goto out; + return -ENOMEM; /* * Believe it or not, zeroing out the page array caused a .5% * performance regression in a database benchmark. So, we take @@ -1189,32 +1161,32 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, memset(dio, 0, offsetof(struct dio, pages)); dio->flags = flags; - if (dio->flags & DIO_LOCKING) { - if (iov_iter_rw(iter) == READ) { - struct address_space *mapping = - iocb->ki_filp->f_mapping; - - /* will be released by direct_io_worker */ - inode_lock(inode); - - retval = filemap_write_and_wait_range(mapping, offset, - end - 1); - if (retval) { - inode_unlock(inode); - kmem_cache_free(dio_cache, dio); - goto out; - } - } + if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) { + /* will be released by direct_io_worker */ + inode_lock(inode); } /* Once we sampled i_size check for reads beyond EOF */ dio->i_size = i_size_read(inode); if (iov_iter_rw(iter) == READ && offset >= dio->i_size) { - if (dio->flags & DIO_LOCKING) - inode_unlock(inode); - kmem_cache_free(dio_cache, dio); retval = 0; - goto out; + goto fail_dio; + } + + if (align & blocksize_mask) { + if (bdev) + blkbits = blksize_bits(bdev_logical_block_size(bdev)); + blocksize_mask = (1 << blkbits) - 1; + if (align & blocksize_mask) + goto fail_dio; + } + + if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) { + struct address_space *mapping = iocb->ki_filp->f_mapping; + + retval = filemap_write_and_wait_range(mapping, offset, end - 1); + if (retval) + goto fail_dio; } /* @@ -1258,14 +1230,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, */ retval = sb_init_dio_done_wq(dio->inode->i_sb); } - if (retval) { - /* - * We grab i_mutex only for reads so we don't have - * to release it here - */ - kmem_cache_free(dio_cache, dio); - goto out; - } + if (retval) + goto fail_dio; } /* @@ -1368,7 +1334,13 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, } else BUG_ON(retval != -EIOCBQUEUED); -out: + return retval; + +fail_dio: + if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) + inode_unlock(inode); + + kmem_cache_free(dio_cache, dio); return retval; } diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index f82a4952769d..ee92634196a8 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -4,6 +4,7 @@ menuconfig DLM depends on INET depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) select IP_SCTP + select SRCU help A general purpose distributed lock manager for kernel or userspace applications. 
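Aside: the fs/dlm/config.c change that follows replaces the old check_zero integer flag in CLUSTER_ATTR() with an optional per-attribute validation callback. Below is a minimal userspace sketch of that pattern, under stated assumptions: store_value() stands in for cluster_set(), and apart from DEFAULT_BUFFER_SIZE and DEFAULT_TCP_PORT the names are illustrative, not the kernel's.

/*
 * Sketch of callback-based attribute validation: each attribute supplies
 * an optional predicate that rejects bad values, instead of a boolean
 * "check_zero" flag. Illustrative names only.
 */
#include <stdbool.h>
#include <stdio.h>

#define DEFAULT_TCP_PORT    21064
#define DEFAULT_BUFFER_SIZE  4096

/* Predicates return true when the value must be rejected. */
static bool check_zero(unsigned int x)
{
	return !x;
}

static bool check_buffer_size(unsigned int x)
{
	return x < DEFAULT_BUFFER_SIZE;
}

/* Store x in *field unless the attribute's optional check rejects it. */
static int store_value(unsigned int *field, unsigned int x,
		       bool (*check_cb)(unsigned int))
{
	if (check_cb && check_cb(x))
		return -1;
	*field = x;
	return 0;
}

int main(void)
{
	unsigned int tcp_port = DEFAULT_TCP_PORT;
	unsigned int buffer_size = DEFAULT_BUFFER_SIZE;

	if (store_value(&tcp_port, 0, check_zero))
		printf("rejected zero tcp_port\n");
	if (store_value(&buffer_size, 512, check_buffer_size))
		printf("rejected undersized buffer\n");

	store_value(&buffer_size, 8192, check_buffer_size);
	printf("tcp_port=%u buffer_size=%u\n", tcp_port, buffer_size);
	return 0;
}

The callback generalizes the old flag: new constraints such as the buffer-size minimum can be added per attribute without growing the macro's parameter list.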
diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 47f0b98b707f..49c5f9407098 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -125,7 +125,7 @@ static ssize_t cluster_cluster_name_store(struct config_item *item, CONFIGFS_ATTR(cluster_, cluster_name); static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, - int *info_field, int check_zero, + int *info_field, bool (*check_cb)(unsigned int x), const char *buf, size_t len) { unsigned int x; @@ -137,7 +137,7 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, if (rc) return rc; - if (check_zero && !x) + if (check_cb && check_cb(x)) return -EINVAL; *cl_field = x; @@ -146,13 +146,13 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, return len; } -#define CLUSTER_ATTR(name, check_zero) \ +#define CLUSTER_ATTR(name, check_cb) \ static ssize_t cluster_##name##_store(struct config_item *item, \ const char *buf, size_t len) \ { \ struct dlm_cluster *cl = config_item_to_cluster(item); \ return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \ - check_zero, buf, len); \ + check_cb, buf, len); \ } \ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \ { \ @@ -161,20 +161,30 @@ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \ } \ CONFIGFS_ATTR(cluster_, name); -CLUSTER_ATTR(tcp_port, 1); -CLUSTER_ATTR(buffer_size, 1); -CLUSTER_ATTR(rsbtbl_size, 1); -CLUSTER_ATTR(recover_timer, 1); -CLUSTER_ATTR(toss_secs, 1); -CLUSTER_ATTR(scan_secs, 1); -CLUSTER_ATTR(log_debug, 0); -CLUSTER_ATTR(log_info, 0); -CLUSTER_ATTR(protocol, 0); -CLUSTER_ATTR(mark, 0); -CLUSTER_ATTR(timewarn_cs, 1); -CLUSTER_ATTR(waitwarn_us, 0); -CLUSTER_ATTR(new_rsb_count, 0); -CLUSTER_ATTR(recover_callbacks, 0); +static bool dlm_check_zero(unsigned int x) +{ + return !x; +} + +static bool dlm_check_buffer_size(unsigned int x) +{ + return (x < DEFAULT_BUFFER_SIZE); +} + +CLUSTER_ATTR(tcp_port, dlm_check_zero); +CLUSTER_ATTR(buffer_size, dlm_check_buffer_size); +CLUSTER_ATTR(rsbtbl_size, dlm_check_zero); +CLUSTER_ATTR(recover_timer, dlm_check_zero); +CLUSTER_ATTR(toss_secs, dlm_check_zero); +CLUSTER_ATTR(scan_secs, dlm_check_zero); +CLUSTER_ATTR(log_debug, NULL); +CLUSTER_ATTR(log_info, NULL); +CLUSTER_ATTR(protocol, NULL); +CLUSTER_ATTR(mark, NULL); +CLUSTER_ATTR(timewarn_cs, dlm_check_zero); +CLUSTER_ATTR(waitwarn_us, NULL); +CLUSTER_ATTR(new_rsb_count, NULL); +CLUSTER_ATTR(recover_callbacks, NULL); static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port, @@ -221,6 +231,7 @@ struct dlm_space { struct list_head members; struct mutex members_lock; int members_count; + struct dlm_nodes *nds; }; struct dlm_comms { @@ -430,6 +441,7 @@ static struct config_group *make_space(struct config_group *g, const char *name) INIT_LIST_HEAD(&sp->members); mutex_init(&sp->members_lock); sp->members_count = 0; + sp->nds = nds; return &sp->group; fail: @@ -451,6 +463,7 @@ static void drop_space(struct config_group *g, struct config_item *i) static void release_space(struct config_item *i) { struct dlm_space *sp = config_item_to_space(i); + kfree(sp->nds); kfree(sp); } @@ -857,18 +870,22 @@ int dlm_comm_seq(int nodeid, uint32_t *seq) return 0; } -int dlm_comm_mark(int nodeid, unsigned int *mark) +void dlm_comm_mark(int nodeid, unsigned int *mark) { struct dlm_comm *cm; cm = get_comm(nodeid); - if (!cm) - return -ENOENT; + if (!cm) { + *mark = dlm_config.ci_mark; + return; + } - *mark = cm->mark; - put_comm(cm); + if (cm->mark) + 
*mark = cm->mark; + else + *mark = dlm_config.ci_mark; - return 0; + put_comm(cm); } int dlm_our_nodeid(void) @@ -889,7 +906,6 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) /* Config file defaults */ #define DEFAULT_TCP_PORT 21064 -#define DEFAULT_BUFFER_SIZE 4096 #define DEFAULT_RSBTBL_SIZE 1024 #define DEFAULT_RECOVER_TIMER 5 #define DEFAULT_TOSS_SECS 10 diff --git a/fs/dlm/config.h b/fs/dlm/config.h index f62996cad561..c210250a2581 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -12,6 +12,8 @@ #ifndef __CONFIG_DOT_H__ #define __CONFIG_DOT_H__ +#define DEFAULT_BUFFER_SIZE 4096 + struct dlm_config_node { int nodeid; int weight; @@ -46,7 +48,7 @@ void dlm_config_exit(void); int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, int *count_out); int dlm_comm_seq(int nodeid, uint32_t *seq); -int dlm_comm_mark(int nodeid, unsigned int *mark); +void dlm_comm_mark(int nodeid, unsigned int *mark); int dlm_our_nodeid(void); int dlm_our_addr(struct sockaddr_storage *addr, int num); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 5050fe05769b..79f56f16bc2c 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -65,40 +65,6 @@ #define MAX_SEND_MSG_COUNT 25 #define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000) -struct cbuf { - unsigned int base; - unsigned int len; - unsigned int mask; -}; - -static void cbuf_add(struct cbuf *cb, int n) -{ - cb->len += n; -} - -static int cbuf_data(struct cbuf *cb) -{ - return ((cb->base + cb->len) & cb->mask); -} - -static void cbuf_init(struct cbuf *cb, int size) -{ - cb->base = cb->len = 0; - cb->mask = size-1; -} - -static void cbuf_eat(struct cbuf *cb, int n) -{ - cb->len -= n; - cb->base += n; - cb->base &= cb->mask; -} - -static bool cbuf_empty(struct cbuf *cb) -{ - return cb->len == 0; -} - struct connection { struct socket *sock; /* NULL if not connected */ uint32_t nodeid; /* So we know who we are in the list */ @@ -117,8 +83,6 @@ struct connection { int (*rx_action) (struct connection *); /* What to do when active */ void (*connect_action) (struct connection *); /* What to do to connect */ void (*shutdown_action)(struct connection *con); /* What to do to shutdown */ - struct page *rx_page; - struct cbuf cb; int retries; #define MAX_CONNECT_RETRIES 3 struct hlist_node list; @@ -126,6 +90,10 @@ struct connection { struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */ + unsigned char *rx_buf; + int rx_buflen; + int rx_leftover; + struct rcu_head rcu; }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -167,8 +135,8 @@ static struct workqueue_struct *recv_workqueue; static struct workqueue_struct *send_workqueue; static struct hlist_head connection_hash[CONN_HASH_SIZE]; -static DEFINE_MUTEX(connections_lock); -static struct kmem_cache *con_cache; +static DEFINE_SPINLOCK(connections_lock); +DEFINE_STATIC_SRCU(connections_srcu); static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); @@ -184,15 +152,20 @@ static inline int nodeid_hash(int nodeid) static struct connection *__find_con(int nodeid) { - int r; + int r, idx; struct connection *con; r = nodeid_hash(nodeid); - hlist_for_each_entry(con, &connection_hash[r], list) { - if (con->nodeid == nodeid) + idx = srcu_read_lock(&connections_srcu); + hlist_for_each_entry_rcu(con, &connection_hash[r], list) { + if (con->nodeid == nodeid) { + srcu_read_unlock(&connections_srcu, idx); 
return con; + } } + srcu_read_unlock(&connections_srcu, idx); + return NULL; } @@ -200,21 +173,25 @@ static struct connection *__find_con(int nodeid) * If 'allocation' is zero then we don't attempt to create a new * connection structure for this node. */ -static struct connection *__nodeid2con(int nodeid, gfp_t alloc) +static struct connection *nodeid2con(int nodeid, gfp_t alloc) { - struct connection *con = NULL; + struct connection *con, *tmp; int r; con = __find_con(nodeid); if (con || !alloc) return con; - con = kmem_cache_zalloc(con_cache, alloc); + con = kzalloc(sizeof(*con), alloc); if (!con) return NULL; - r = nodeid_hash(nodeid); - hlist_add_head(&con->list, &connection_hash[r]); + con->rx_buflen = dlm_config.ci_buffer_size; + con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS); + if (!con->rx_buf) { + kfree(con); + return NULL; + } con->nodeid = nodeid; mutex_init(&con->sock_mutex); @@ -233,31 +210,41 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc) con->rx_action = zerocon->rx_action; } + r = nodeid_hash(nodeid); + + spin_lock(&connections_lock); + /* Because multiple workqueues/threads call this function, it can + * race on multiple CPUs. Instead of locking the hot path __find_con(), + * we just re-check for recently added nodes under protection of + * connections_lock. If we find one, we abort our connection + * creation and return the existing connection. + */ + tmp = __find_con(nodeid); + if (tmp) { + spin_unlock(&connections_lock); + kfree(con->rx_buf); + kfree(con); + return tmp; + } + + hlist_add_head_rcu(&con->list, &connection_hash[r]); + spin_unlock(&connections_lock); + return con; } /* Loop round all connections */ static void foreach_conn(void (*conn_func)(struct connection *c)) { - int i; - struct hlist_node *n; + int i, idx; struct connection *con; + idx = srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { - hlist_for_each_entry_safe(con, n, &connection_hash[i], list) + hlist_for_each_entry_rcu(con, &connection_hash[i], list) conn_func(con); } -} - -static struct connection *nodeid2con(int nodeid, gfp_t allocation) -{ - struct connection *con; - - mutex_lock(&connections_lock); - con = __nodeid2con(nodeid, allocation); - mutex_unlock(&connections_lock); - - return con; + srcu_read_unlock(&connections_srcu, idx); } static struct dlm_node_addr *find_node_addr(int nodeid) @@ -614,11 +601,8 @@ static void close_connection(struct connection *con, bool and_other, /* Will only re-enter once. 
*/ close_connection(con->othercon, false, true, true); } - if (con->rx_page) { - __free_page(con->rx_page); - con->rx_page = NULL; - } + con->rx_leftover = 0; con->retries = 0; mutex_unlock(&con->sock_mutex); clear_bit(CF_CLOSING, &con->flags); @@ -672,16 +656,33 @@ static void dlm_tcp_shutdown(struct connection *con) shutdown_connection(con); } +static int con_realloc_receive_buf(struct connection *con, int newlen) +{ + unsigned char *newbuf; + + newbuf = kmalloc(newlen, GFP_NOFS); + if (!newbuf) + return -ENOMEM; + + /* copy any leftover from last receive */ + if (con->rx_leftover) + memmove(newbuf, con->rx_buf, con->rx_leftover); + + /* swap to new buffer space */ + kfree(con->rx_buf); + con->rx_buflen = newlen; + con->rx_buf = newbuf; + + return 0; +} + /* Data received from remote end */ static int receive_from_sock(struct connection *con) { - int ret = 0; - struct msghdr msg = {}; - struct kvec iov[2]; - unsigned len; - int r; int call_again_soon = 0; - int nvec; + struct msghdr msg; + struct kvec iov; + int ret, buflen; mutex_lock(&con->sock_mutex); @@ -689,71 +690,55 @@ static int receive_from_sock(struct connection *con) ret = -EAGAIN; goto out_close; } + if (con->nodeid == 0) { ret = -EINVAL; goto out_close; } - if (con->rx_page == NULL) { - /* - * This doesn't need to be atomic, but I think it should - * improve performance if it is. - */ - con->rx_page = alloc_page(GFP_ATOMIC); - if (con->rx_page == NULL) + /* realloc if we get new buffer size to read out */ + buflen = dlm_config.ci_buffer_size; + if (con->rx_buflen != buflen && con->rx_leftover <= buflen) { + ret = con_realloc_receive_buf(con, buflen); + if (ret < 0) goto out_resched; - cbuf_init(&con->cb, PAGE_SIZE); } - /* - * iov[0] is the bit of the circular buffer between the current end - * point (cb.base + cb.len) and the end of the buffer. 
- */ - iov[0].iov_len = con->cb.base - cbuf_data(&con->cb); - iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb); - iov[1].iov_len = 0; - nvec = 1; - - /* - * iov[1] is the bit of the circular buffer between the start of the - * buffer and the start of the currently used section (cb.base) + /* calculate new buffer parameters based on the last receive and + * any possible leftover bytes */ - if (cbuf_data(&con->cb) >= con->cb.base) { - iov[0].iov_len = PAGE_SIZE - cbuf_data(&con->cb); - iov[1].iov_len = con->cb.base; - iov[1].iov_base = page_address(con->rx_page); - nvec = 2; - } - len = iov[0].iov_len + iov[1].iov_len; - iov_iter_kvec(&msg.msg_iter, READ, iov, nvec, len); + iov.iov_base = con->rx_buf + con->rx_leftover; + iov.iov_len = con->rx_buflen - con->rx_leftover; - r = ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT | MSG_NOSIGNAL); + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, + msg.msg_flags); if (ret <= 0) goto out_close; - else if (ret == len) + else if (ret == iov.iov_len) call_again_soon = 1; - cbuf_add(&con->cb, ret); - ret = dlm_process_incoming_buffer(con->nodeid, - page_address(con->rx_page), - con->cb.base, con->cb.len, - PAGE_SIZE); - if (ret < 0) { - log_print("lowcomms err %d: addr=%p, base=%u, len=%u, read=%d", - ret, page_address(con->rx_page), con->cb.base, - con->cb.len, r); - cbuf_eat(&con->cb, r); - } else { - cbuf_eat(&con->cb, ret); - } + /* new buflen according to the bytes read and the leftover from the last receive */ + buflen = ret + con->rx_leftover; + ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen); + if (ret < 0) + goto out_close; - if (cbuf_empty(&con->cb) && !call_again_soon) { - __free_page(con->rx_page); - con->rx_page = NULL; + /* calculate the bytes left over after processing and move them to + * the beginning of the receive buffer, so the next receive finds + * the full message at the start address of the buffer. + */ + con->rx_leftover = buflen - ret; + if (con->rx_leftover) { + memmove(con->rx_buf, con->rx_buf + ret, + con->rx_leftover); + call_again_soon = true; } if (call_again_soon) goto out_resched; + mutex_unlock(&con->sock_mutex); return 0; @@ -791,13 +776,11 @@ static int accept_from_sock(struct connection *con) int nodeid; struct connection *newcon; struct connection *addcon; + unsigned int mark; - mutex_lock(&connections_lock); if (!dlm_allow_conn) { - mutex_unlock(&connections_lock); return -1; } - mutex_unlock(&connections_lock); mutex_lock_nested(&con->sock_mutex, 0); @@ -830,6 +813,9 @@ static int accept_from_sock(struct connection *con) return -1; } + dlm_comm_mark(nodeid, &mark); + sock_set_mark(newsock->sk, mark); + log_print("got connection from %d", nodeid); /* Check to see if we already have a connection to this node. 
This @@ -847,13 +833,24 @@ static int accept_from_sock(struct connection *con) struct connection *othercon = newcon->othercon; if (!othercon) { - othercon = kmem_cache_zalloc(con_cache, GFP_NOFS); + othercon = kzalloc(sizeof(*othercon), GFP_NOFS); if (!othercon) { log_print("failed to allocate incoming socket"); mutex_unlock(&newcon->sock_mutex); result = -ENOMEM; goto accept_err; } + + othercon->rx_buflen = dlm_config.ci_buffer_size; + othercon->rx_buf = kmalloc(othercon->rx_buflen, GFP_NOFS); + if (!othercon->rx_buf) { + mutex_unlock(&newcon->sock_mutex); + kfree(othercon); + log_print("failed to allocate incoming socket receive buffer"); + result = -ENOMEM; + goto accept_err; + } + othercon->nodeid = nodeid; othercon->rx_action = receive_from_sock; mutex_init(&othercon->sock_mutex); @@ -975,6 +972,8 @@ static void sctp_connect_to_sock(struct connection *con) return; } + dlm_comm_mark(con->nodeid, &mark); + mutex_lock(&con->sock_mutex); /* Some odd races can cause double-connects, ignore them */ @@ -999,11 +998,6 @@ static void sctp_connect_to_sock(struct connection *con) if (result < 0) goto socket_err; - /* set skb mark */ - result = dlm_comm_mark(con->nodeid, &mark); - if (result < 0) - goto bind_err; - sock_set_mark(sock->sk, mark); con->rx_action = receive_from_sock; @@ -1076,6 +1070,8 @@ static void tcp_connect_to_sock(struct connection *con) return; } + dlm_comm_mark(con->nodeid, &mark); + mutex_lock(&con->sock_mutex); if (con->retries++ > MAX_CONNECT_RETRIES) goto out; @@ -1090,11 +1086,6 @@ static void tcp_connect_to_sock(struct connection *con) if (result < 0) goto out_err; - /* set skb mark */ - result = dlm_comm_mark(con->nodeid, &mark); - if (result < 0) - goto out_err; - sock_set_mark(sock->sk, mark); memset(&saddr, 0, sizeof(saddr)); @@ -1238,6 +1229,14 @@ static void init_local(void) } } +static void deinit_local(void) +{ + int i; + + for (i = 0; i < dlm_local_count; i++) + kfree(dlm_local_addr[i]); +} + /* Initialise SCTP socket and bind to all interfaces */ static int sctp_listen_for_all(void) { @@ -1546,13 +1545,6 @@ static void process_send_sockets(struct work_struct *work) send_to_sock(con); } - -/* Discard all entries on the write queues */ -static void clean_writequeues(void) -{ - foreach_conn(clean_one_writequeue); -} - static void work_stop(void) { if (recv_workqueue) @@ -1608,26 +1600,34 @@ static void shutdown_conn(struct connection *con) con->shutdown_action(con); } +static void connection_release(struct rcu_head *rcu) +{ + struct connection *con = container_of(rcu, struct connection, rcu); + + kfree(con->rx_buf); + kfree(con); +} + static void free_conn(struct connection *con) { close_connection(con, true, true, true); - if (con->othercon) - kmem_cache_free(con_cache, con->othercon); - hlist_del(&con->list); - kmem_cache_free(con_cache, con); + spin_lock(&connections_lock); + hlist_del_rcu(&con->list); + spin_unlock(&connections_lock); + if (con->othercon) { + clean_one_writequeue(con->othercon); + call_rcu(&con->othercon->rcu, connection_release); + } + clean_one_writequeue(con); + call_rcu(&con->rcu, connection_release); } static void work_flush(void) { - int ok; + int ok, idx; int i; - struct hlist_node *n; struct connection *con; - if (recv_workqueue) - flush_workqueue(recv_workqueue); - if (send_workqueue) - flush_workqueue(send_workqueue); do { ok = 1; foreach_conn(stop_conn); @@ -1635,9 +1635,10 @@ static void work_flush(void) flush_workqueue(recv_workqueue); if (send_workqueue) flush_workqueue(send_workqueue); + idx = 
srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE && ok; i++) { - hlist_for_each_entry_safe(con, n, - &connection_hash[i], list) { + hlist_for_each_entry_rcu(con, &connection_hash[i], + list) { ok &= test_bit(CF_READ_PENDING, &con->flags); ok &= test_bit(CF_WRITE_PENDING, &con->flags); if (con->othercon) { @@ -1648,6 +1649,7 @@ static void work_flush(void) } } } + srcu_read_unlock(&connections_srcu, idx); } while (!ok); } @@ -1656,16 +1658,18 @@ void dlm_lowcomms_stop(void) /* Set all the flags to prevent any socket activity. */ - mutex_lock(&connections_lock); dlm_allow_conn = 0; - mutex_unlock(&connections_lock); + + if (recv_workqueue) + flush_workqueue(recv_workqueue); + if (send_workqueue) + flush_workqueue(send_workqueue); + foreach_conn(shutdown_conn); work_flush(); - clean_writequeues(); foreach_conn(free_conn); work_stop(); - - kmem_cache_destroy(con_cache); + deinit_local(); } int dlm_lowcomms_start(void) @@ -1684,16 +1688,9 @@ int dlm_lowcomms_start(void) goto fail; } - error = -ENOMEM; - con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection), - __alignof__(struct connection), 0, - NULL); - if (!con_cache) - goto fail; - error = work_start(); if (error) - goto fail_destroy; + goto fail; dlm_allow_conn = 1; @@ -1710,12 +1707,8 @@ int dlm_lowcomms_start(void) fail_unlisten: dlm_allow_conn = 0; con = nodeid2con(0,0); - if (con) { - close_connection(con, false, true, true); - kmem_cache_free(con_cache, con); - } -fail_destroy: - kmem_cache_destroy(con_cache); + if (con) + free_conn(con); fail: return error; } diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 921322d133e3..fde3a6afe4be 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -22,114 +22,84 @@ * into packets and sends them to the comms layer. */ +#include <asm/unaligned.h> + #include "dlm_internal.h" #include "lowcomms.h" #include "config.h" #include "lock.h" #include "midcomms.h" - -static void copy_from_cb(void *dst, const void *base, unsigned offset, - unsigned len, unsigned limit) -{ - unsigned copy = len; - - if ((copy + offset) > limit) - copy = limit - offset; - memcpy(dst, base + offset, copy); - len -= copy; - if (len) - memcpy(dst + copy, base, len); -} - /* * Called from the low-level comms layer to process a buffer of * commands. - * - * Only complete messages are processed here, any "spare" bytes from - * the end of a buffer are saved and tacked onto the front of the next - * message that comes in. I doubt this will happen very often but we - * need to be able to cope with it and I don't want the task to be waiting - * for packets to come in when there is useful work to be done. */ -int dlm_process_incoming_buffer(int nodeid, const void *base, - unsigned offset, unsigned len, unsigned limit) +int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) { - union { - unsigned char __buf[DLM_INBUF_LEN]; - /* this is to force proper alignment on some arches */ - union dlm_packet p; - } __tmp; - union dlm_packet *p = &__tmp.p; - int ret = 0; - int err = 0; + const unsigned char *ptr = buf; + const struct dlm_header *hd; uint16_t msglen; - uint32_t lockspace; - - while (len > sizeof(struct dlm_header)) { - - /* Copy just the header to check the total length. The - message may wrap around the end of the buffer back to the - start, so we need to use a temp buffer and copy_from_cb. 
*/ - - copy_from_cb(p, base, offset, sizeof(struct dlm_header), - limit); - - msglen = le16_to_cpu(p->header.h_length); - lockspace = p->header.h_lockspace; + int ret = 0; - err = -EINVAL; - if (msglen < sizeof(struct dlm_header)) - break; - if (p->header.h_cmd == DLM_MSG) { - if (msglen < sizeof(struct dlm_message)) - break; - } else { - if (msglen < sizeof(struct dlm_rcom)) - break; - } - err = -E2BIG; - if (msglen > dlm_config.ci_buffer_size) { - log_print("message size %d from %d too big, buf len %d", - msglen, nodeid, len); - break; + while (len >= sizeof(struct dlm_header)) { + hd = (struct dlm_header *)ptr; + + /* no message should be larger than this, otherwise we + * cannot deliver it to the upper layers + */ + msglen = get_unaligned_le16(&hd->h_length); + if (msglen > DEFAULT_BUFFER_SIZE) { + log_print("received invalid length header: %u, will abort message parsing", + msglen); + return -EBADMSG; } - err = 0; - - /* If only part of the full message is contained in this - buffer, then do nothing and wait for lowcomms to call - us again later with more data. We return 0 meaning - we've consumed none of the input buffer. */ + /* the caller will take care that any leftover + * is parsed on the next call with more data + */ if (msglen > len) break; - /* Allocate a larger temp buffer if the full message won't fit - in the buffer on the stack (which should work for most - ordinary messages). */ - - if (msglen > sizeof(__tmp) && p == &__tmp.p) { - p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); - if (p == NULL) - return ret; - } + switch (hd->h_cmd) { + case DLM_MSG: + if (msglen < sizeof(struct dlm_message)) { + log_print("dlm msg too small: %u, will skip this message", + msglen); + goto skip; + } - copy_from_cb(p, base, offset, msglen, limit); + break; + case DLM_RCOM: + if (msglen < sizeof(struct dlm_rcom)) { + log_print("dlm rcom msg too small: %u, will skip this message", + msglen); + goto skip; + } - BUG_ON(lockspace != p->header.h_lockspace); + break; + default: + log_print("unsupported h_cmd received: %u, will skip this message", + hd->h_cmd); + goto skip; + } + /* for aligned memory access, we just copy the current message + * to the beginning of the buffer, which contains already-parsed + * data and thus provides aligned access for the upper layers, + * because the start address of the buffer is aligned. This + * memmove can be removed once the upper layer is capable of + * unaligned memory access. + */ + memmove(buf, ptr, msglen); + dlm_receive_buffer((union dlm_packet *)buf, nodeid); + +skip: ret += msglen; - offset += msglen; - offset &= (limit - 1); len -= msglen; - - dlm_receive_buffer(p, nodeid); + ptr += msglen; } - if (p != &__tmp.p) - kfree(p); - - return err ? err : ret; + return ret; }
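The dlm hunks above replace the circular receive buffer with a single linear one: the receive path appends new bytes after any leftover, dlm_process_incoming_buffer() consumes only complete messages and returns how many bytes it ate, and the unconsumed tail is moved back to the front for the next receive. A minimal userspace sketch of that pattern, with a bare length-prefixed record standing in for struct dlm_header and all names hypothetical:

#include <stdint.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>

#define RX_BUFLEN 4096

struct rxbuf {
	unsigned char buf[RX_BUFLEN];
	int leftover;		/* bytes kept from the previous receive */
};

/* Consume complete [u16 len][payload] records; return bytes consumed. */
static int process_buffer(unsigned char *buf, int len)
{
	const unsigned char *ptr = buf;
	int ret = 0;

	while (len >= (int)sizeof(uint16_t)) {
		uint16_t msglen;

		/* unaligned-safe load; the kernel uses get_unaligned_le16() */
		memcpy(&msglen, ptr, sizeof(msglen));
		if (msglen < sizeof(msglen) || msglen > RX_BUFLEN)
			return -1;	/* malformed header: abort parsing */
		if ((int)msglen > len)
			break;		/* partial message: leave as leftover */

		/* deliver ptr[0..msglen) to the upper layer here */

		ret += msglen;
		len -= msglen;
		ptr += msglen;
	}
	return ret;
}

static int receive_once(int sock, struct rxbuf *rx)
{
	ssize_t n;
	int buflen, used;

	n = recv(sock, rx->buf + rx->leftover, RX_BUFLEN - rx->leftover, 0);
	if (n <= 0)
		return (int)n;

	buflen = rx->leftover + (int)n;
	used = process_buffer(rx->buf, buflen);
	if (used < 0)
		return -1;

	/* move the tail of a partial message to the front for next time */
	rx->leftover = buflen - used;
	if (rx->leftover)
		memmove(rx->buf, rx->buf + used, rx->leftover);
	return (int)n;
}

Unlike the removed cbuf scheme, a message can never wrap around the buffer end, so every message is contiguous and, after the memmove, starts at the buffer's aligned base address.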
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 2e122e81c8d0..61e90a921849 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -12,8 +12,7 @@ #ifndef __MIDCOMMS_DOT_H__ #define __MIDCOMMS_DOT_H__ -int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset, - unsigned len, unsigned limit); +int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen); #endif /* __MIDCOMMS_DOT_H__ */ diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index e338c407cb75..67f68d48d60c 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -62,7 +62,7 @@ static int user_cmd(struct sk_buff *skb, struct genl_info *info) return 0; } -static const struct genl_ops dlm_nl_ops[] = { +static const struct genl_small_ops dlm_nl_ops[] = { { .cmd = DLM_CMD_HELLO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -73,8 +73,8 @@ static const struct genl_ops dlm_nl_ops[] = { static struct genl_family family __ro_after_init = { .name = DLM_GENL_NAME, .version = DLM_GENL_VERSION, - .ops = dlm_nl_ops, - .n_ops = ARRAY_SIZE(dlm_nl_ops), + .small_ops = dlm_nl_ops, + .n_small_ops = ARRAY_SIZE(dlm_nl_ops), .module = THIS_MODULE, }; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 28bb5689333a..15880a68faad 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -141,6 +141,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; + /* replace invalid slashes like kobject_set_name_vargs does for /sys/firmware/efi/vars. */ + strreplace(name, '/', '!'); + inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0, is_removable); if (!inode) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 459ecb42cbd3..347be146884c 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -224,7 +224,7 @@ submit_bio_retry: bio_set_dev(bio, sb->s_bdev); bio->bi_iter.bi_sector = (sector_t)blknr << LOG_SECTORS_PER_BLOCK; - bio->bi_opf = REQ_OP_READ; + bio->bi_opf = REQ_OP_READ | (ra ? 
REQ_RAHEAD : 0); } err = bio_add_page(bio, page, PAGE_SIZE, 0); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index ddaa516c008a..b9a09806512a 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -211,9 +211,7 @@ static void erofs_default_options(struct erofs_fs_context *ctx) enum { Opt_user_xattr, - Opt_nouser_xattr, Opt_acl, - Opt_noacl, Opt_cache_strategy, Opt_err }; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index c8c381eadcd6..5bde77d70852 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -473,8 +473,6 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler, return -EOPNOTSUPP; break; case EROFS_XATTR_INDEX_TRUSTED: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; break; case EROFS_XATTR_INDEX_SECURITY: break; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 6c939def00f9..50912a5420b4 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -135,6 +135,7 @@ struct z_erofs_decompress_frontend { struct z_erofs_collector clt; struct erofs_map_blocks map; + bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; @@ -153,8 +154,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock); static void preload_compressed_pages(struct z_erofs_collector *clt, struct address_space *mc, - enum z_erofs_cache_alloctype type, - struct list_head *pagepool) + enum z_erofs_cache_alloctype type) { const struct z_erofs_pcluster *pcl = clt->pcl; const unsigned int clusterpages = BIT(pcl->clusterbits); @@ -562,8 +562,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, - struct list_head *pagepool) + struct page *page) { struct inode *const inode = fe->inode; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); @@ -620,8 +619,7 @@ restart_now: else cache_strategy = DONTALLOC; - preload_compressed_pages(clt, MNGD_MAPPING(sbi), - cache_strategy, pagepool); + preload_compressed_pages(clt, MNGD_MAPPING(sbi), cache_strategy); hitted: /* @@ -653,7 +651,7 @@ retry: /* should allocate an additional staging page for pagevec */ if (err == -EAGAIN) { struct page *const newpage = - erofs_allocpage(pagepool, GFP_NOFS | __GFP_NOFAIL); + alloc_page(GFP_NOFS | __GFP_NOFAIL); newpage->mapping = Z_EROFS_MAPPING_STAGING; err = z_erofs_attach_page(clt, newpage, @@ -1151,7 +1149,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, } static void z_erofs_submit_queue(struct super_block *sb, - z_erofs_next_pcluster_t owned_head, + struct z_erofs_decompress_frontend *f, struct list_head *pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg) @@ -1160,6 +1158,7 @@ static void z_erofs_submit_queue(struct super_block *sb, z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; + z_erofs_next_pcluster_t owned_head = f->clt.owned_head; /* since bio will be NULL, no need to initialize last_index */ pgoff_t last_index; unsigned int nr_bios = 0; @@ -1193,7 +1192,6 @@ static void z_erofs_submit_queue(struct super_block *sb, do { struct page *page; - int err; page = pickup_page_for_submission(pcl, i++, pagepool, MNGD_MAPPING(sbi), @@ -1216,11 +1214,12 @@ submit_bio_retry: LOG_SECTORS_PER_BLOCK; bio->bi_private = bi_private; bio->bi_opf = REQ_OP_READ; + if (f->readahead) + bio->bi_opf |= REQ_RAHEAD; ++nr_bios; } - err = bio_add_page(bio, page, PAGE_SIZE, 0); - if (err < PAGE_SIZE) + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) goto submit_bio_retry; 
last_index = cur; @@ -1248,14 +1247,14 @@ submit_bio_retry: } static void z_erofs_runqueue(struct super_block *sb, - struct z_erofs_collector *clt, + struct z_erofs_decompress_frontend *f, struct list_head *pagepool, bool force_fg) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; - if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) + if (f->clt.owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(sb, clt->owned_head, pagepool, io, &force_fg); + z_erofs_submit_queue(sb, f, pagepool, io, &force_fg); /* handle bypass queue (no i/o pclusters) immediately */ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); @@ -1282,11 +1281,11 @@ static int z_erofs_readpage(struct file *file, struct page *page) f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; - err = z_erofs_do_read_page(&f, page, &pagepool); + err = z_erofs_do_read_page(&f, page); (void)z_erofs_collector_end(&f.clt); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, true); + z_erofs_runqueue(inode->i_sb, &f, &pagepool, true); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); @@ -1299,25 +1298,20 @@ static int z_erofs_readpage(struct file *file, struct page *page) return err; } -static bool should_decompress_synchronously(struct erofs_sb_info *sbi, - unsigned int nr) -{ - return nr <= sbi->ctx.max_sync_decompress_pages; -} - static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - bool sync = should_decompress_synchronously(sbi, readahead_count(rac)); + unsigned int nr_pages = readahead_count(rac); + bool sync = (nr_pages <= sbi->ctx.max_sync_decompress_pages); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); struct page *page, *head = NULL; LIST_HEAD(pagepool); - trace_erofs_readpages(inode, readahead_index(rac), - readahead_count(rac), false); + trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); + f.readahead = true; f.headoffset = readahead_pos(rac); while ((page = readahead_page(rac))) { @@ -1341,7 +1335,7 @@ static void z_erofs_readahead(struct readahead_control *rac) /* traversal in reverse order */ head = (void *)page_private(page); - err = z_erofs_do_read_page(&f, page, &pagepool); + err = z_erofs_do_read_page(&f, page); if (err) erofs_err(inode->i_sb, "readahead error at page %lu @ nid %llu", @@ -1351,7 +1345,7 @@ static void z_erofs_readahead(struct readahead_control *rac) (void)z_erofs_collector_end(&f.clt); - z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, sync); + z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync); if (f.map.mpage) put_page(f.map.mpage); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8107e06d7f6f..4df61129566d 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -218,8 +218,7 @@ struct eventpoll { struct file *file; /* used to optimize loop detection check */ - struct list_head visited_list_link; - int visited; + u64 gen; #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ @@ -274,6 +273,8 @@ static long max_user_watches __read_mostly; */ static DEFINE_MUTEX(epmutex); +static u64 loop_check_gen = 0; + /* Used to check for epoll file descriptor inclusion loops */ static struct nested_calls poll_loop_ncalls; @@ -283,9 +284,6 @@ static struct kmem_cache *epi_cache __read_mostly; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __read_mostly; -/* Visited nodes during ep_loop_check(), so we can unset them when 
we finish */ -static LIST_HEAD(visited_list); - /* * List of files with newly added links, where we may need to limit the number * of emanating paths. Protected by the epmutex. @@ -1450,7 +1448,7 @@ static int reverse_path_check(void) static int ep_create_wakeup_source(struct epitem *epi) { - const char *name; + struct name_snapshot n; struct wakeup_source *ws; if (!epi->ep->ws) { @@ -1459,8 +1457,9 @@ static int ep_create_wakeup_source(struct epitem *epi) return -ENOMEM; } - name = epi->ffd.file->f_path.dentry->d_name.name; - ws = wakeup_source_register(NULL, name); + take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry); + ws = wakeup_source_register(NULL, n.name.name); + release_dentry_name_snapshot(&n); if (!ws) return -ENOMEM; @@ -1522,6 +1521,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, RCU_INIT_POINTER(epi->ws, NULL); } + /* Add the current item to the list of active epoll hook for this file */ + spin_lock(&tfile->f_lock); + list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links); + spin_unlock(&tfile->f_lock); + + /* + * Add the current item to the RB tree. All RB tree operations are + * protected by "mtx", and ep_insert() is called with "mtx" held. + */ + ep_rbtree_insert(ep, epi); + + /* now check if we've created too many backpaths */ + error = -EINVAL; + if (full_check && reverse_path_check()) + goto error_remove_epi; + /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); @@ -1544,22 +1559,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, if (epi->nwait < 0) goto error_unregister; - /* Add the current item to the list of active epoll hook for this file */ - spin_lock(&tfile->f_lock); - list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links); - spin_unlock(&tfile->f_lock); - - /* - * Add the current item to the RB tree. All RB tree operations are - * protected by "mtx", and ep_insert() is called with "mtx" held. - */ - ep_rbtree_insert(ep, epi); - - /* now check if we've created too many backpaths */ - error = -EINVAL; - if (full_check && reverse_path_check()) - goto error_remove_epi; - /* We have to drop the new item inside our item list to keep track of it */ write_lock_irq(&ep->lock); @@ -1588,6 +1587,8 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, return 0; +error_unregister: + ep_unregister_pollwait(ep, epi); error_remove_epi: spin_lock(&tfile->f_lock); list_del_rcu(&epi->fllink); @@ -1595,9 +1596,6 @@ error_remove_epi: rb_erase_cached(&epi->rbn, &ep->rbr); -error_unregister: - ep_unregister_pollwait(ep, epi); - /* * We need to do this because an event could have been arrived on some * allocated wait queue. 
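The eventpoll changes above drop the visited_list in favor of a bare generation counter: stamping a node with the current loop_check_gen marks it visited, and bumping the counter before the next walk invalidates every old mark at once, so no cleanup pass over visited nodes is needed. The idea in isolation, as a small sketch with hypothetical node types:

#include <stddef.h>
#include <stdint.h>

struct node {
	uint64_t gen;		/* generation this node was last visited in */
	struct node *children[8];
	size_t nr_children;
};

static uint64_t walk_gen;

static void visit(struct node *n)
{
	size_t i;

	if (n->gen == walk_gen)
		return;		/* already seen during this walk: skip */
	n->gen = walk_gen;

	/* ... inspect n here ... */

	for (i = 0; i < n->nr_children; i++)
		visit(n->children[i]);
}

static void start_walk(struct node *root)
{
	walk_gen++;		/* O(1) "reset": all old marks go stale */
	visit(root);
}

The counter only has to be wide enough never to wrap back onto a stale mark, which a u64 bumped once per EPOLL_CTL_ADD check easily guarantees; epoll additionally bounds the recursion depth through ep_call_nested(), which the sketch leaves out.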
Note that we don't care about the ep->ovflist @@ -1972,13 +1970,12 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) struct epitem *epi; mutex_lock_nested(&ep->mtx, call_nests + 1); - ep->visited = 1; - list_add(&ep->visited_list_link, &visited_list); + ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { ep_tovisit = epi->ffd.file->private_data; - if (ep_tovisit->visited) + if (ep_tovisit->gen == loop_check_gen) continue; error = ep_call_nested(&poll_loop_ncalls, ep_loop_check_proc, epi->ffd.file, @@ -2019,18 +2016,8 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) */ static int ep_loop_check(struct eventpoll *ep, struct file *file) { - int ret; - struct eventpoll *ep_cur, *ep_next; - - ret = ep_call_nested(&poll_loop_ncalls, + return ep_call_nested(&poll_loop_ncalls, ep_loop_check_proc, file, ep, current); - /* clear visited list */ - list_for_each_entry_safe(ep_cur, ep_next, &visited_list, - visited_list_link) { - ep_cur->visited = 0; - list_del(&ep_cur->visited_list_link); - } - return ret; } static void clear_tfile_check_list(void) @@ -2195,11 +2182,13 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { if (!list_empty(&f.file->f_ep_links) || + ep->gen == loop_check_gen || is_file_epoll(tf.file)) { mutex_unlock(&ep->mtx); error = epoll_mutex_lock(&epmutex, 0, nonblock); if (error) goto error_tgt_fput; + loop_check_gen++; full_check = 1; if (is_file_epoll(tf.file)) { error = -ELOOP; @@ -2263,6 +2252,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, error_tgt_fput: if (full_check) { clear_tfile_check_list(); + loop_check_gen++; mutex_unlock(&epmutex); } diff --git a/fs/exec.c b/fs/exec.c index a91003e28eaa..547a2390baf5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -23,6 +23,7 @@ * formats. 
*/ +#include <linux/kernel_read_file.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> @@ -62,6 +63,7 @@ #include <linux/oom.h> #include <linux/compat.h> #include <linux/vmalloc.h> +#include <linux/io_uring.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -949,137 +951,6 @@ struct file *open_exec(const char *name) } EXPORT_SYMBOL(open_exec); -int kernel_read_file(struct file *file, void **buf, loff_t *size, - loff_t max_size, enum kernel_read_file_id id) -{ - loff_t i_size, pos; - ssize_t bytes = 0; - int ret; - - if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0) - return -EINVAL; - - ret = deny_write_access(file); - if (ret) - return ret; - - ret = security_kernel_read_file(file, id); - if (ret) - goto out; - - i_size = i_size_read(file_inode(file)); - if (i_size <= 0) { - ret = -EINVAL; - goto out; - } - if (i_size > SIZE_MAX || (max_size > 0 && i_size > max_size)) { - ret = -EFBIG; - goto out; - } - - if (id != READING_FIRMWARE_PREALLOC_BUFFER) - *buf = vmalloc(i_size); - if (!*buf) { - ret = -ENOMEM; - goto out; - } - - pos = 0; - while (pos < i_size) { - bytes = kernel_read(file, *buf + pos, i_size - pos, &pos); - if (bytes < 0) { - ret = bytes; - goto out_free; - } - - if (bytes == 0) - break; - } - - if (pos != i_size) { - ret = -EIO; - goto out_free; - } - - ret = security_kernel_post_read_file(file, *buf, i_size, id); - if (!ret) - *size = pos; - -out_free: - if (ret < 0) { - if (id != READING_FIRMWARE_PREALLOC_BUFFER) { - vfree(*buf); - *buf = NULL; - } - } - -out: - allow_write_access(file); - return ret; -} -EXPORT_SYMBOL_GPL(kernel_read_file); - -int kernel_read_file_from_path(const char *path, void **buf, loff_t *size, - loff_t max_size, enum kernel_read_file_id id) -{ - struct file *file; - int ret; - - if (!path || !*path) - return -EINVAL; - - file = filp_open(path, O_RDONLY, 0); - if (IS_ERR(file)) - return PTR_ERR(file); - - ret = kernel_read_file(file, buf, size, max_size, id); - fput(file); - return ret; -} -EXPORT_SYMBOL_GPL(kernel_read_file_from_path); - -int kernel_read_file_from_path_initns(const char *path, void **buf, - loff_t *size, loff_t max_size, - enum kernel_read_file_id id) -{ - struct file *file; - struct path root; - int ret; - - if (!path || !*path) - return -EINVAL; - - task_lock(&init_task); - get_fs_root(init_task.fs, &root); - task_unlock(&init_task); - - file = file_open_root(root.dentry, root.mnt, path, O_RDONLY, 0); - path_put(&root); - if (IS_ERR(file)) - return PTR_ERR(file); - - ret = kernel_read_file(file, buf, size, max_size, id); - fput(file); - return ret; -} -EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns); - -int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size, - enum kernel_read_file_id id) -{ - struct fd f = fdget(fd); - int ret = -EBADF; - - if (!f.file) - goto out; - - ret = kernel_read_file(f.file, buf, size, max_size, id); -out: - fdput(f); - return ret; -} -EXPORT_SYMBOL_GPL(kernel_read_file_from_fd); - #if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \ defined(CONFIG_BINFMT_ELF_FDPIC) ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) @@ -1130,11 +1001,24 @@ static int exec_mmap(struct mm_struct *mm) } task_lock(tsk); - active_mm = tsk->active_mm; membarrier_exec_mmap(mm); - tsk->mm = mm; + + local_irq_disable(); + active_mm = tsk->active_mm; tsk->active_mm = mm; + tsk->mm = mm; + /* + * This prevents preemption while active_mm is being loaded and + * it and mm are being updated, which could cause problems 
for + * lazy tlb mm refcounting when these are updated by context + * switches. Not all architectures can handle irqs off over + * activate_mm yet. + */ + if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) + local_irq_enable(); activate_mm(active_mm, mm); + if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) + local_irq_enable(); tsk->mm->vmacache_seqnum = 0; vmacache_flush(tsk); task_unlock(tsk); @@ -1895,6 +1779,11 @@ static int bprm_execve(struct linux_binprm *bprm, struct files_struct *displaced; int retval; + /* + * Cancel any io_uring activity across execve + */ + io_uring_task_cancel(); + retval = unshare_files(&displaced); if (retval) return retval; diff --git a/fs/exfat/cache.c b/fs/exfat/cache.c index 03d0824fc368..5a2f119b7e8c 100644 --- a/fs/exfat/cache.c +++ b/fs/exfat/cache.c @@ -17,7 +17,6 @@ #include "exfat_raw.h" #include "exfat_fs.h" -#define EXFAT_CACHE_VALID 0 #define EXFAT_MAX_CACHE 16 struct exfat_cache { @@ -61,16 +60,6 @@ void exfat_cache_shutdown(void) kmem_cache_destroy(exfat_cachep); } -void exfat_cache_init_inode(struct inode *inode) -{ - struct exfat_inode_info *ei = EXFAT_I(inode); - - spin_lock_init(&ei->cache_lru_lock); - ei->nr_caches = 0; - ei->cache_valid_id = EXFAT_CACHE_VALID + 1; - INIT_LIST_HEAD(&ei->cache_lru); -} - static inline struct exfat_cache *exfat_cache_alloc(void) { return kmem_cache_alloc(exfat_cachep, GFP_NOFS); diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 95d717f8620c..c013fe931d9c 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -248,6 +248,8 @@ struct exfat_sb_info { struct rcu_head rcu; }; +#define EXFAT_CACHE_VALID 0 + /* * EXFAT file system inode in-memory data */ @@ -428,7 +430,6 @@ extern const struct dentry_operations exfat_utf8_dentry_ops; /* cache.c */ int exfat_cache_init(void); void exfat_cache_shutdown(void); -void exfat_cache_init_inode(struct inode *inode); void exfat_cache_inval_inode(struct inode *inode); int exfat_get_cluster(struct inode *inode, unsigned int cluster, unsigned int *fclus, unsigned int *dclus, diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 7f90204adef5..a6de17cac3df 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -611,8 +611,6 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) ei->i_crtime = info->crtime; inode->i_atime = info->atime; - exfat_cache_init_inode(inode); - return 0; } diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index e73f20f66cb2..c94ac239f740 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -578,7 +578,8 @@ static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, i_pos = exfat_make_i_pos(&info); inode = exfat_build_inode(sb, &info, i_pos); - if (IS_ERR(inode)) + err = PTR_ERR_OR_ZERO(inode); + if (err) goto unlock; inode_inc_iversion(inode); @@ -745,10 +746,9 @@ static struct dentry *exfat_lookup(struct inode *dir, struct dentry *dentry, i_pos = exfat_make_i_pos(&info); inode = exfat_build_inode(sb, &info, i_pos); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); + err = PTR_ERR_OR_ZERO(inode); + if (err) goto unlock; - } i_mode = inode->i_mode; alias = d_find_alias(inode); @@ -890,10 +890,9 @@ static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) i_pos = exfat_make_i_pos(&info); inode = exfat_build_inode(sb, &info, i_pos); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); + err = PTR_ERR_OR_ZERO(inode); + if (err) goto unlock; - } inode_inc_iversion(inode); inode->i_mtime = inode->i_atime = inode->i_ctime = diff --git a/fs/exfat/super.c b/fs/exfat/super.c 
index 3b6a1659892f..60b941ba557b 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -376,7 +376,6 @@ static int exfat_read_root(struct inode *inode) inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = current_time(inode); exfat_truncate_atime(&inode->i_atime); - exfat_cache_init_inode(inode); return 0; } @@ -763,6 +762,10 @@ static void exfat_inode_init_once(void *foo) { struct exfat_inode_info *ei = (struct exfat_inode_info *)foo; + spin_lock_init(&ei->cache_lru_lock); + ei->nr_caches = 0; + ei->cache_valid_id = EXFAT_CACHE_VALID + 1; + INIT_LIST_HEAD(&ei->cache_lru); INIT_HLIST_NODE(&ei->i_hash_fat); inode_init_once(&ei->vfs_inode); } diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index fa9c951d3471..1f3f4326bf3c 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -189,7 +189,7 @@ static void group_adjust_blocks(struct super_block *sb, int group_no, /** * __rsv_window_dump() -- Dump the filesystem block allocation reservation map - * @rb_root: root of per-filesystem reservation rb tree + * @root: root of per-filesystem reservation rb tree * @verbose: verbose mode * @fn: function which wishes to dump the reservation map * @@ -282,7 +282,7 @@ goal_in_my_reservation(struct ext2_reserve_window *rsv, ext2_grpblk_t grp_goal, /** * search_reserve_window() - * @rb_root: root of reservation tree + * @root: root of reservation tree * @goal: target allocation block * * Find the reserved window which includes the goal, or the previous one @@ -859,7 +859,7 @@ static int find_next_reservable_window( * * failed: we failed to find a reservation window in this group * - * @rsv: the reservation + * @my_rsv: the reservation * * @grp_goal: The goal (group-relative). It is where the search for a * free reservable space should start from. diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 415c21f0e750..11c5c6fe75bb 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -36,7 +36,6 @@ #include <linux/iomap.h> #include <linux/namei.h> #include <linux/uio.h> -#include <linux/fiemap.h> #include "ext2.h" #include "acl.h" #include "xattr.h" diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 1d82336b1cd4..efe77cffc322 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -148,7 +148,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) } if (IS_ENCRYPTED(inode)) { - err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr); + err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr); if (err < 0) return err; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 523e00d7b392..f9a692c0a66c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1401,7 +1401,7 @@ struct ext4_super_block { #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ #ifdef CONFIG_FS_ENCRYPTION -#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_ctx.ctx != NULL) +#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL) #else #define DUMMY_ENCRYPTION_ENABLED(sbi) (0) #endif @@ -1596,8 +1596,8 @@ struct ext4_sb_info { atomic_t s_warning_count; atomic_t s_msg_count; - /* Encryption context for '-o test_dummy_encryption' */ - struct fscrypt_dummy_context s_dummy_enc_ctx; + /* Encryption policy for '-o test_dummy_encryption' */ + struct fscrypt_dummy_policy s_dummy_enc_policy; /* * Barrier between writepages ops and changing any inode's JOURNAL_DATA diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index df25d38d6539..698ca4a4db5f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -742,6 +742,53 @@ not_found: return 1; } +static int ext4_xattr_credits_for_new_inode(struct 
inode *dir, mode_t mode, + bool encrypt) +{ + struct super_block *sb = dir->i_sb; + int nblocks = 0; +#ifdef CONFIG_EXT4_FS_POSIX_ACL + struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); + + if (IS_ERR(p)) + return PTR_ERR(p); + if (p) { + int acl_size = p->a_count * sizeof(ext4_acl_entry); + + nblocks += (S_ISDIR(mode) ? 2 : 1) * + __ext4_xattr_set_credits(sb, NULL /* inode */, + NULL /* block_bh */, acl_size, + true /* is_create */); + posix_acl_release(p); + } +#endif + +#ifdef CONFIG_SECURITY + { + int num_security_xattrs = 1; + +#ifdef CONFIG_INTEGRITY + num_security_xattrs++; +#endif + /* + * We assume that security xattrs are never more than 1k. + * In practice they are under 128 bytes. + */ + nblocks += num_security_xattrs * + __ext4_xattr_set_credits(sb, NULL /* inode */, + NULL /* block_bh */, 1024, + true /* is_create */); + } +#endif + if (encrypt) + nblocks += __ext4_xattr_set_credits(sb, + NULL /* inode */, + NULL /* block_bh */, + FSCRYPT_SET_CONTEXT_MAX_SIZE, + true /* is_create */); + return nblocks; +} + /* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both @@ -772,7 +819,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ext4_group_t i; ext4_group_t flex_group; struct ext4_group_info *grp; - int encrypt = 0; + bool encrypt = false; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) @@ -784,59 +831,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, if (unlikely(ext4_forced_shutdown(sbi))) return ERR_PTR(-EIO); - if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && - (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) && - !(i_flags & EXT4_EA_INODE_FL)) { - err = fscrypt_get_encryption_info(dir); - if (err) - return ERR_PTR(err); - if (!fscrypt_has_encryption_key(dir)) - return ERR_PTR(-ENOKEY); - encrypt = 1; - } - - if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) { -#ifdef CONFIG_EXT4_FS_POSIX_ACL - struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); - - if (IS_ERR(p)) - return ERR_CAST(p); - if (p) { - int acl_size = p->a_count * sizeof(ext4_acl_entry); - - nblocks += (S_ISDIR(mode) ? 2 : 1) * - __ext4_xattr_set_credits(sb, NULL /* inode */, - NULL /* block_bh */, acl_size, - true /* is_create */); - posix_acl_release(p); - } -#endif - -#ifdef CONFIG_SECURITY - { - int num_security_xattrs = 1; - -#ifdef CONFIG_INTEGRITY - num_security_xattrs++; -#endif - /* - * We assume that security xattrs are never - * more than 1k. In practice they are under - * 128 bytes. 
- */ - nblocks += num_security_xattrs * - __ext4_xattr_set_credits(sb, NULL /* inode */, - NULL /* block_bh */, 1024, - true /* is_create */); - } -#endif - if (encrypt) - nblocks += __ext4_xattr_set_credits(sb, - NULL /* inode */, NULL /* block_bh */, - FSCRYPT_SET_CONTEXT_MAX_SIZE, - true /* is_create */); - } - ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); inode = new_inode(sb); @@ -866,10 +860,25 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, else ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID); + if (!(i_flags & EXT4_EA_INODE_FL)) { + err = fscrypt_prepare_new_inode(dir, inode, &encrypt); + if (err) + goto out; + } + err = dquot_initialize(inode); if (err) goto out; + if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) { + ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt); + if (ret2 < 0) { + err = ret2; + goto out; + } + nblocks += ret2; + } + if (!goal) goal = sbi->s_inode_goal; @@ -1162,7 +1171,7 @@ got: * prevent its deduplication. */ if (encrypt) { - err = fscrypt_inherit_context(dir, inode, handle, true); + err = fscrypt_set_context(inode, handle); if (err) goto fail_free_drop; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 153a9fbe1dd0..0d74615fcce3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -663,8 +663,7 @@ static struct stats dx_show_leaf(struct inode *dir, /* Directory is encrypted */ res = fscrypt_fname_alloc_buffer( - dir, len, - &fname_crypto_str); + len, &fname_crypto_str); if (res) printk(KERN_WARNING "Error " "allocating crypto " @@ -1016,8 +1015,8 @@ static int htree_dirblock_to_tree(struct file *dir_file, brelse(bh); return err; } - err = fscrypt_fname_alloc_buffer(dir, EXT4_NAME_LEN, - &fname_crypto_str); + err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, + &fname_crypto_str); if (err < 0) { brelse(bh); return err; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ea425b49b345..8b2736283481 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1104,7 +1104,7 @@ static void ext4_put_super(struct super_block *sb) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev); - fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx); + fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #ifdef CONFIG_UNICODE utf8_unload(sbi->s_encoding); #endif @@ -1392,10 +1392,9 @@ retry: return res; } -static const union fscrypt_context * -ext4_get_dummy_context(struct super_block *sb) +static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb) { - return EXT4_SB(sb)->s_dummy_enc_ctx.ctx; + return EXT4_SB(sb)->s_dummy_enc_policy.policy; } static bool ext4_has_stable_inodes(struct super_block *sb) @@ -1414,7 +1413,7 @@ static const struct fscrypt_operations ext4_cryptops = { .key_prefix = "ext4:", .get_context = ext4_get_context, .set_context = ext4_set_context, - .get_dummy_context = ext4_get_dummy_context, + .get_dummy_policy = ext4_get_dummy_policy, .empty_dir = ext4_empty_dir, .max_namelen = EXT4_NAME_LEN, .has_stable_inodes = ext4_has_stable_inodes, @@ -1888,12 +1887,13 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb, * needed to allow it to be set or changed during remount. We do allow * it to be specified during remount, but only if there is no change. 
*/ - if (is_remount && !sbi->s_dummy_enc_ctx.ctx) { + if (is_remount && !sbi->s_dummy_enc_policy.policy) { ext4_msg(sb, KERN_WARNING, "Can't set test_dummy_encryption on remount"); return -1; } - err = fscrypt_set_test_dummy_encryption(sb, arg, &sbi->s_dummy_enc_ctx); + err = fscrypt_set_test_dummy_encryption(sb, arg->from, + &sbi->s_dummy_enc_policy); if (err) { if (err == -EEXIST) ext4_msg(sb, KERN_WARNING, @@ -4935,7 +4935,7 @@ failed_mount: for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif - fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx); + fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); ext4_blkdev_remove(sbi); brelse(bh); out_fail: diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index bbd5e7e0632b..5b7ba8f71153 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -349,6 +349,7 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { + DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index); struct page *page; index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; @@ -358,8 +359,7 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - page_cache_readahead_unbounded(inode->i_mapping, NULL, - index, num_ra_pages, 0); + page_cache_ra_unbounded(&ractl, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 264696495440..4b9ef8bbfa4a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -112,7 +112,7 @@ static int __f2fs_setup_filename(const struct inode *dir, #ifdef CONFIG_FS_ENCRYPTION fname->crypto_buf = crypt_name->crypto_buf; #endif - if (crypt_name->is_ciphertext_name) { + if (crypt_name->is_nokey_name) { /* hash was decoded from the no-key name */ fname->hash = cpu_to_le32(crypt_name->hash); } else { @@ -531,7 +531,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, goto put_error; if (IS_ENCRYPTED(inode)) { - err = fscrypt_inherit_context(dir, inode, page, false); + err = fscrypt_set_context(inode, page); if (err) goto put_error; } @@ -1026,7 +1026,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) if (err) goto out; - err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr); + err = fscrypt_fname_alloc_buffer(F2FS_NAME_LEN, &fstr); if (err < 0) goto out; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index adda53d20a39..cb700d797296 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -139,7 +139,7 @@ struct f2fs_mount_info { int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ int bggc_mode; /* bggc mode: off, on or sync */ - struct fscrypt_dummy_context dummy_enc_ctx; /* test dummy encryption */ + struct fscrypt_dummy_policy dummy_enc_policy; /* test dummy encryption */ block_t unusable_cap_perc; /* percentage for cap */ block_t unusable_cap; /* Amount of space allowed to be * unusable when disabling checkpoint @@ -1340,13 +1340,6 @@ enum fsync_mode { #define IS_IO_TRACED_PAGE(page) (0) #endif -#ifdef CONFIG_FS_ENCRYPTION -#define DUMMY_ENCRYPTION_ENABLED(sbi) \ - (unlikely(F2FS_OPTION(sbi).dummy_enc_ctx.ctx != NULL)) -#else -#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) -#endif - /* For compression */ enum compress_algorithm_type { COMPRESS_LZO, @@ -4071,22 +4064,6 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS; } -static inline bool f2fs_may_encrypt(struct inode *dir, struct inode *inode) -{ 
-#ifdef CONFIG_FS_ENCRYPTION - struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - umode_t mode = inode->i_mode; - - /* - * If the directory encrypted or dummy encryption enabled, - * then we should encrypt the inode. - */ - if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) - return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); -#endif - return false; -} - static inline bool f2fs_may_compress(struct inode *inode) { if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 90565432559c..8fa37d1434de 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -28,6 +28,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_t ino; struct inode *inode; bool nid_free = false; + bool encrypt = false; int xattr_size = 0; int err; @@ -69,13 +70,17 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, F2FS_DEF_PROJID); + err = fscrypt_prepare_new_inode(dir, inode, &encrypt); + if (err) + goto fail_drop; + err = dquot_initialize(inode); if (err) goto fail_drop; set_inode_flag(inode, FI_NEW_INODE); - if (f2fs_may_encrypt(dir, inode)) + if (encrypt) f2fs_set_encrypted_inode(inode); if (f2fs_sb_has_extra_attr(sbi)) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 18f914247f01..0c958fed3392 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -435,12 +435,12 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, * needed to allow it to be set or changed during remount. We do allow * it to be specified during remount, but only if there is no change. */ - if (is_remount && !F2FS_OPTION(sbi).dummy_enc_ctx.ctx) { + if (is_remount && !F2FS_OPTION(sbi).dummy_enc_policy.policy) { f2fs_warn(sbi, "Can't set test_dummy_encryption on remount"); return -EINVAL; } err = fscrypt_set_test_dummy_encryption( - sb, arg, &F2FS_OPTION(sbi).dummy_enc_ctx); + sb, arg->from, &F2FS_OPTION(sbi).dummy_enc_policy); if (err) { if (err == -EEXIST) f2fs_warn(sbi, @@ -1295,7 +1295,7 @@ static void f2fs_put_super(struct super_block *sb) for (i = 0; i < MAXQUOTAS; i++) kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif - fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); + fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy); destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); @@ -2513,10 +2513,9 @@ static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, ctx, len, fs_data, XATTR_CREATE); } -static const union fscrypt_context * -f2fs_get_dummy_context(struct super_block *sb) +static const union fscrypt_policy *f2fs_get_dummy_policy(struct super_block *sb) { - return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_ctx.ctx; + return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_policy.policy; } static bool f2fs_has_stable_inodes(struct super_block *sb) @@ -2554,7 +2553,7 @@ static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, - .get_dummy_context = f2fs_get_dummy_context, + .get_dummy_policy = f2fs_get_dummy_policy, .empty_dir = f2fs_empty_dir, .max_namelen = F2FS_NAME_LEN, .has_stable_inodes = f2fs_has_stable_inodes, @@ -3926,7 +3925,7 @@ free_options: for (i = 0; i < MAXQUOTAS; i++) kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif - fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); + fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy); kvfree(options); free_sb_buf: kfree(raw_super); diff --git a/fs/f2fs/verity.c 
b/fs/f2fs/verity.c index 9eb0dba851e8..054ec852b5ea 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -228,6 +228,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { + DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index); struct page *page; index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; @@ -237,8 +238,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - page_cache_readahead_unbounded(inode->i_mapping, NULL, - index, num_ra_pages, 0); + page_cache_ra_unbounded(&ractl, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/file.c b/fs/file.c index 21c0893f2f1d..4559b5fec3bd 100644 --- a/fs/file.c +++ b/fs/file.c @@ -21,6 +21,7 @@ #include <linux/rcupdate.h> #include <linux/close_range.h> #include <net/sock.h> +#include <linux/io_uring.h> unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; @@ -452,6 +453,7 @@ void exit_files(struct task_struct *tsk) struct files_struct * files = tsk->files; if (files) { + io_uring_files_cancel(files); task_lock(tsk); tsk->files = NULL; task_unlock(tsk); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 149227160ff0..e6005c78bfa9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2184,7 +2184,7 @@ static int __init start_dirtytime_writeback(void) __initcall(start_dirtytime_writeback); int dirtytime_interval_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -2321,7 +2321,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) wb = locked_inode_to_wb_and_lock_list(inode); - WARN(bdi_cap_writeback_dirty(wb->bdi) && + WARN((wb->bdi->capabilities & BDI_CAP_WRITEBACK) && !test_bit(WB_registered, &wb->state), "bdi-%s not registered\n", bdi_dev_name(wb->bdi)); @@ -2346,7 +2346,8 @@ void __mark_inode_dirty(struct inode *inode, int flags) * to make sure background write-back happens * later. 
*/ - if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi) + if (wakeup_bdi && + (wb->bdi->capabilities & BDI_CAP_WRITEBACK)) wb_wakeup_delayed(wb); return; } @@ -2581,7 +2582,7 @@ int write_inode_now(struct inode *inode, int sync) .range_end = LLONG_MAX, }; - if (!mapping_cap_writeback_dirty(inode->i_mapping)) + if (!mapping_can_writeback(inode->i_mapping)) wbc.nr_to_write = 0; might_sleep(); diff --git a/fs/fs_parser.c b/fs/fs_parser.c index ab53e42a874a..68b0148f4bb8 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -189,7 +189,7 @@ out: } EXPORT_SYMBOL(fs_lookup_param); -int fs_param_bad_value(struct p_log *log, struct fs_parameter *param) +static int fs_param_bad_value(struct p_log *log, struct fs_parameter *param) { return inval_plog(log, "Bad value for '%s'", param->key); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 6611ef3269a8..43c165e796da 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3091,11 +3091,10 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) ssize_t ret = 0; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; - bool async_dio = ff->fc->async_dio; loff_t pos = 0; struct inode *inode; loff_t i_size; - size_t count = iov_iter_count(iter); + size_t count = iov_iter_count(iter), shortened = 0; loff_t offset = iocb->ki_pos; struct fuse_io_priv *io; @@ -3103,17 +3102,9 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) inode = file->f_mapping->host; i_size = i_size_read(inode); - if ((iov_iter_rw(iter) == READ) && (offset > i_size)) + if ((iov_iter_rw(iter) == READ) && (offset >= i_size)) return 0; - /* optimization for short read */ - if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) { - if (offset >= i_size) - return 0; - iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset)); - count = iov_iter_count(iter); - } - io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); if (!io) return -ENOMEM; @@ -3129,15 +3120,22 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * By default, we want to optimize all I/Os with async request * submission to the client filesystem if supported. */ - io->async = async_dio; + io->async = ff->fc->async_dio; io->iocb = iocb; io->blocking = is_sync_kiocb(iocb); + /* optimization for short read */ + if (io->async && !io->write && offset + count > i_size) { + iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset)); + shortened = count - iov_iter_count(iter); + count -= shortened; + } + /* * We cannot asynchronously extend the size of a file. * In such case the aio will behave exactly like sync io. 
*/ - if ((offset + count > i_size) && iov_iter_rw(iter) == WRITE) + if ((offset + count > i_size) && io->write) io->blocking = true; if (io->async && io->blocking) { @@ -3155,6 +3153,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } else { ret = __fuse_direct_read(io, iter, &pos); } + iov_iter_reexpand(iter, iov_iter_count(iter) + shortened); if (io->async) { bool blocking = io->blocking; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index bba747520e9b..581329203d68 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1049,9 +1049,9 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) if (err) return err; - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; /* fuse does it's own writeback accounting */ - sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; + sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT; + sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT; /* * For a single fuse filesystem use max 1% of dirty + diff --git a/fs/inode.c b/fs/inode.c index 72c4c347afb7..9d78c37b00b8 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -181,6 +181,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; + if (sb->s_type->fs_flags & FS_THP_SUPPORT) + __set_bit(AS_THP_SUPPORT, &mapping->flags); mapping->wb_err = 0; atomic_set(&mapping->i_mmap_writable, 0); #ifdef CONFIG_READ_ONLY_THP_FOR_FS diff --git a/fs/internal.h b/fs/internal.h index 10517ece4516..a7cd0f64faa4 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -82,9 +82,6 @@ int may_linkat(struct path *link); /* * namespace.c */ -extern void *copy_mount_options(const void __user *); -extern char *copy_mount_string(const void __user *); - extern struct vfsmount *lookup_mnt(const struct path *); extern int finish_automount(struct vfsmount *, struct path *); diff --git a/fs/io-wq.c b/fs/io-wq.c index 414beb543883..0a182f1333e8 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -17,6 +17,7 @@ #include <linux/rculist_nulls.h> #include <linux/fs_struct.h> #include <linux/task_work.h> +#include <linux/blk-cgroup.h> #include "io-wq.h" @@ -26,9 +27,8 @@ enum { IO_WORKER_F_UP = 1, /* up and active */ IO_WORKER_F_RUNNING = 2, /* account as running */ IO_WORKER_F_FREE = 4, /* worker on free list */ - IO_WORKER_F_EXITING = 8, /* worker exiting */ - IO_WORKER_F_FIXED = 16, /* static idle worker */ - IO_WORKER_F_BOUND = 32, /* is doing bounded work */ + IO_WORKER_F_FIXED = 8, /* static idle worker */ + IO_WORKER_F_BOUND = 16, /* is doing bounded work */ }; enum { @@ -57,9 +57,13 @@ struct io_worker { struct rcu_head rcu; struct mm_struct *mm; +#ifdef CONFIG_BLK_CGROUP + struct cgroup_subsys_state *blkcg_css; +#endif const struct cred *cur_creds; const struct cred *saved_creds; struct files_struct *restore_files; + struct nsproxy *restore_nsproxy; struct fs_struct *restore_fs; }; @@ -87,7 +91,7 @@ enum { */ struct io_wqe { struct { - spinlock_t lock; + raw_spinlock_t lock; struct io_wq_work_list work_list; unsigned long hash_map; unsigned flags; @@ -148,11 +152,12 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) if (current->files != worker->restore_files) { __acquire(&wqe->lock); - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); dropped_lock = true; task_lock(current); current->files = worker->restore_files; + current->nsproxy = worker->restore_nsproxy; task_unlock(current); } @@ -166,7 +171,7 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) if (worker->mm) { if 
(!dropped_lock) { __acquire(&wqe->lock); - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); dropped_lock = true; } __set_current_state(TASK_RUNNING); @@ -175,6 +180,13 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) worker->mm = NULL; } +#ifdef CONFIG_BLK_CGROUP + if (worker->blkcg_css) { + kthread_associate_blkcg(NULL); + worker->blkcg_css = NULL; + } +#endif + return dropped_lock; } @@ -200,7 +212,6 @@ static void io_worker_exit(struct io_worker *worker) { struct io_wqe *wqe = worker->wqe; struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); - unsigned nr_workers; /* * If we're not at zero, someone else is holding a brief reference @@ -220,23 +231,19 @@ static void io_worker_exit(struct io_worker *worker) worker->flags = 0; preempt_enable(); - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); if (__io_worker_unuse(wqe, worker)) { __release(&wqe->lock); - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); } acct->nr_workers--; - nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers + - wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers; - spin_unlock_irq(&wqe->lock); - - /* all workers gone, wq exit can proceed */ - if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs)) - complete(&wqe->wq->done); + raw_spin_unlock_irq(&wqe->lock); kfree_rcu(worker, rcu); + if (refcount_dec_and_test(&wqe->wq->refs)) + complete(&wqe->wq->done); } static inline bool io_wqe_run_queue(struct io_wqe *wqe) @@ -318,6 +325,7 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker) worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); worker->restore_files = current->files; + worker->restore_nsproxy = current->nsproxy; worker->restore_fs = current->fs; io_wqe_inc_running(wqe, worker); } @@ -436,6 +444,17 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) work->flags |= IO_WQ_WORK_CANCEL; } +static inline void io_wq_switch_blkcg(struct io_worker *worker, + struct io_wq_work *work) +{ +#ifdef CONFIG_BLK_CGROUP + if (work->blkcg_css != worker->blkcg_css) { + kthread_associate_blkcg(work->blkcg_css); + worker->blkcg_css = work->blkcg_css; + } +#endif +} + static void io_wq_switch_creds(struct io_worker *worker, struct io_wq_work *work) { @@ -454,6 +473,7 @@ static void io_impersonate_work(struct io_worker *worker, if (work->files && current->files != work->files) { task_lock(current); current->files = work->files; + current->nsproxy = work->nsproxy; task_unlock(current); } if (work->fs && current->fs != work->fs) @@ -463,6 +483,7 @@ static void io_impersonate_work(struct io_worker *worker, if (worker->cur_creds != work->creds) io_wq_switch_creds(worker, work); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize; + io_wq_switch_blkcg(worker, work); } static void io_assign_current_work(struct io_worker *worker, @@ -504,7 +525,7 @@ get_next: else if (!wq_list_empty(&wqe->work_list)) wqe->flags |= IO_WQE_FLAG_STALLED; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (!work) break; io_assign_current_work(worker, work); @@ -538,17 +559,17 @@ get_next: io_wqe_enqueue(wqe, linked); if (hash != -1U && !next_hashed) { - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); wqe->hash_map &= ~BIT_ULL(hash); wqe->flags &= ~IO_WQE_FLAG_STALLED; /* skip unnecessary unlock-lock wqe->lock */ if (!work) goto get_next; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); } } while (work); - 
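
The blanket spin_lock_irq() to raw_spin_lock_irq() conversion running through io-wq here matters on PREEMPT_RT, where plain spinlock_t becomes a sleeping lock; wqe->lock is taken from scheduler-adjacent worker paths, so it has to stay a genuinely spinning lock. The shape of the conversion, reduced to a stub (illustrative names only):

    /* reduced illustration: the lock type changes, the critical
     * sections themselves do not */
    struct wqe_stub {
        raw_spinlock_t lock;    /* was: spinlock_t lock; */
    };

    static void wqe_stub_op(struct wqe_stub *wqe)
    {
        raw_spin_lock_irq(&wqe->lock);      /* was: spin_lock_irq() */
        /* ... work list manipulation unchanged ... */
        raw_spin_unlock_irq(&wqe->lock);
    }
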
spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); } while (1); } @@ -563,7 +584,7 @@ static int io_wqe_worker(void *data) while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { set_current_state(TASK_INTERRUPTIBLE); loop: - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); if (io_wqe_run_queue(wqe)) { __set_current_state(TASK_RUNNING); io_worker_handle_work(worker); @@ -574,7 +595,7 @@ loop: __release(&wqe->lock); goto loop; } - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (signal_pending(current)) flush_signals(current); if (schedule_timeout(WORKER_IDLE_TIMEOUT)) @@ -586,11 +607,11 @@ loop: } if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); if (!wq_list_empty(&wqe->work_list)) io_worker_handle_work(worker); else - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); } io_worker_exit(worker); @@ -630,14 +651,14 @@ void io_wq_worker_sleeping(struct task_struct *tsk) worker->flags &= ~IO_WORKER_F_RUNNING; - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); io_wqe_dec_running(wqe, worker); - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); } static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) { - struct io_wqe_acct *acct =&wqe->acct[index]; + struct io_wqe_acct *acct = &wqe->acct[index]; struct io_worker *worker; worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); @@ -656,7 +677,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) return false; } - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); list_add_tail_rcu(&worker->all_list, &wqe->all_list); worker->flags |= IO_WORKER_F_FREE; @@ -665,11 +686,12 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND)) worker->flags |= IO_WORKER_F_FIXED; acct->nr_workers++; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (index == IO_WQ_ACCT_UNBOUND) atomic_inc(&wq->user->processes); + refcount_inc(&wq->refs); wake_up_process(worker->task); return true; } @@ -685,28 +707,63 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) return acct->nr_workers < acct->max_workers; } +static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) +{ + send_sig(SIGINT, worker->task, 1); + return false; +} + +/* + * Iterate the passed in list and call the specific function for each + * worker that isn't exiting + */ +static bool io_wq_for_each_worker(struct io_wqe *wqe, + bool (*func)(struct io_worker *, void *), + void *data) +{ + struct io_worker *worker; + bool ret = false; + + list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { + if (io_worker_get(worker)) { + /* no task if node is/was offline */ + if (worker->task) + ret = func(worker, data); + io_worker_release(worker); + if (ret) + break; + } + } + + return ret; +} + +static bool io_wq_worker_wake(struct io_worker *worker, void *data) +{ + wake_up_process(worker->task); + return false; +} + /* * Manager thread. Tasked with creating new workers, if we need them. 
*/ static int io_wq_manager(void *data) { struct io_wq *wq = data; - int workers_to_create = num_possible_nodes(); int node; /* create fixed workers */ - refcount_set(&wq->refs, workers_to_create); + refcount_set(&wq->refs, 1); for_each_node(node) { if (!node_online(node)) continue; - if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) - goto err; - workers_to_create--; + if (create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) + continue; + set_bit(IO_WQ_BIT_ERROR, &wq->state); + set_bit(IO_WQ_BIT_EXIT, &wq->state); + goto out; } - while (workers_to_create--) - refcount_dec(&wq->refs); - complete(&wq->done); while (!kthread_should_stop()) { @@ -720,12 +777,12 @@ static int io_wq_manager(void *data) if (!node_online(node)) continue; - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND)) fork_worker[IO_WQ_ACCT_BOUND] = true; if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND)) fork_worker[IO_WQ_ACCT_UNBOUND] = true; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (fork_worker[IO_WQ_ACCT_BOUND]) create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND); if (fork_worker[IO_WQ_ACCT_UNBOUND]) @@ -738,12 +795,18 @@ static int io_wq_manager(void *data) if (current->task_works) task_work_run(); - return 0; -err: - set_bit(IO_WQ_BIT_ERROR, &wq->state); - set_bit(IO_WQ_BIT_EXIT, &wq->state); - if (refcount_sub_and_test(workers_to_create, &wq->refs)) +out: + if (refcount_dec_and_test(&wq->refs)) { complete(&wq->done); + return 0; + } + /* if ERROR is set and we get here, we have workers to wake */ + if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) { + rcu_read_lock(); + for_each_node(node) + io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); + rcu_read_unlock(); + } return 0; } @@ -821,10 +884,10 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) } work_flags = work->flags; - spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock_irqsave(&wqe->lock, flags); io_wqe_insert_work(wqe, work); wqe->flags &= ~IO_WQE_FLAG_STALLED; - spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock_irqrestore(&wqe->lock, flags); if ((work_flags & IO_WQ_WORK_CONCURRENT) || !atomic_read(&acct->nr_running)) @@ -850,37 +913,6 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); } -static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) -{ - send_sig(SIGINT, worker->task, 1); - return false; -} - -/* - * Iterate the passed in list and call the specific function for each - * worker that isn't exiting - */ -static bool io_wq_for_each_worker(struct io_wqe *wqe, - bool (*func)(struct io_worker *, void *), - void *data) -{ - struct io_worker *worker; - bool ret = false; - - list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { - if (io_worker_get(worker)) { - /* no task if node is/was offline */ - if (worker->task) - ret = func(worker, data); - io_worker_release(worker); - if (ret) - break; - } - } - - return ret; -} - void io_wq_cancel_all(struct io_wq *wq) { int node; @@ -951,13 +983,13 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe, unsigned long flags; retry: - spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock_irqsave(&wqe->lock, flags); wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; io_wqe_remove_pending(wqe, work, prev); - spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock_irqrestore(&wqe->lock, flags); 
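
The io_wq_manager() rewrite above replaces the old "pre-charge one reference per expected worker, subtract on failure" scheme with a simpler one: the count starts at 1 for the manager, create_io_worker() takes a reference per worker, and whichever party drops the count to zero completes wq->done. Condensed to the three sites involved (a sketch of the scheme, not the full code):

    /* io_wq_manager(), on entry: the manager's own reference */
    refcount_set(&wq->refs, 1);

    /* create_io_worker(), on success: one reference per worker */
    refcount_inc(&wq->refs);

    /* io_worker_exit() and the manager's exit path both end with: */
    if (refcount_dec_and_test(&wq->refs))
        complete(&wq->done);
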
io_run_cancel(work, wqe); match->nr_pending++; if (!match->cancel_all) @@ -966,7 +998,7 @@ retry: /* not safe to continue after unlock */ goto retry; } - spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock_irqrestore(&wqe->lock, flags); } static void io_wqe_cancel_running_work(struct io_wqe *wqe, @@ -1074,7 +1106,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) } atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0); wqe->wq = wq; - spin_lock_init(&wqe->lock); + raw_spin_lock_init(&wqe->lock); INIT_WQ_LIST(&wqe->work_list); INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); INIT_LIST_HEAD(&wqe->all_list); @@ -1113,12 +1145,6 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data) return refcount_inc_not_zero(&wq->use_refs); } -static bool io_wq_worker_wake(struct io_worker *worker, void *data) -{ - wake_up_process(worker->task); - return false; -} - static void __io_wq_destroy(struct io_wq *wq) { int node; diff --git a/fs/io-wq.h b/fs/io-wq.h index ddaf9614cf9b..84bcf6a85523 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -87,7 +87,11 @@ struct io_wq_work { struct io_wq_work_node list; struct files_struct *files; struct mm_struct *mm; +#ifdef CONFIG_BLK_CGROUP + struct cgroup_subsys_state *blkcg_css; +#endif const struct cred *creds; + struct nsproxy *nsproxy; struct fs_struct *fs; unsigned long fsize; unsigned flags; diff --git a/fs/io_uring.c b/fs/io_uring.c index 3790c7fe9fee..2e1dc354cd08 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -79,6 +79,8 @@ #include <linux/splice.h> #include <linux/task_work.h> #include <linux/pagemap.h> +#include <linux/io_uring.h> +#include <linux/blk-cgroup.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -98,6 +100,8 @@ #define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT) #define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1) #define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE) +#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ + IORING_REGISTER_LAST + IORING_OP_LAST) struct io_uring { u32 head ____cacheline_aligned_in_smp; @@ -187,6 +191,7 @@ struct io_mapped_ubuf { size_t len; struct bio_vec *bvec; unsigned int nr_bvecs; + unsigned long acct_pages; }; struct fixed_file_table { @@ -205,7 +210,7 @@ struct fixed_file_data { struct fixed_file_table *table; struct io_ring_ctx *ctx; - struct percpu_ref *cur_refs; + struct fixed_file_ref_node *node; struct percpu_ref refs; struct completion done; struct list_head ref_list; @@ -219,6 +224,27 @@ struct io_buffer { __u16 bid; }; +struct io_restriction { + DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); + DECLARE_BITMAP(sqe_op, IORING_OP_LAST); + u8 sqe_flags_allowed; + u8 sqe_flags_required; + bool registered; +}; + +struct io_sq_data { + refcount_t refs; + struct mutex lock; + + /* ctx's that are using this sqd */ + struct list_head ctx_list; + struct list_head ctx_new_list; + struct mutex ctx_lock; + + struct task_struct *thread; + struct wait_queue_head wait; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -231,6 +257,7 @@ struct io_ring_ctx { unsigned int cq_overflow_flushed: 1; unsigned int drain_next: 1; unsigned int eventfd_async: 1; + unsigned int restricted: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -264,9 +291,25 @@ struct io_ring_ctx { /* IO offload */ struct io_wq *io_wq; - struct task_struct *sqo_thread; /* if using sq thread polling */ - struct mm_struct *sqo_mm; - wait_queue_head_t sqo_wait; + + /* + * For SQPOLL usage - we hold a reference to the parent task, so we 
+ * have access to the ->files + */ + struct task_struct *sqo_task; + + /* Only used for accounting purposes */ + struct mm_struct *mm_account; + +#ifdef CONFIG_BLK_CGROUP + struct cgroup_subsys_state *sqo_blkcg_css; +#endif + + struct io_sq_data *sq_data; /* if using sq thread polling */ + + struct wait_queue_head sqo_sq_wait; + struct wait_queue_entry sqo_wait_entry; + struct list_head sqd_list; /* * If used, fixed file set. Writers must ensure that ->refs is dead, @@ -275,8 +318,6 @@ struct io_ring_ctx { */ struct fixed_file_data *file_data; unsigned nr_user_files; - int ring_fd; - struct file *ring_file; /* if used, fixed mapped user buffers */ unsigned nr_user_bufs; @@ -338,6 +379,7 @@ struct io_ring_ctx { struct llist_head file_put_llist; struct work_struct exit_work; + struct io_restriction restrictions; }; /* @@ -392,13 +434,16 @@ struct io_cancel { struct io_timeout { struct file *file; - u64 addr; - int flags; u32 off; u32 target_seq; struct list_head list; }; +struct io_timeout_rem { + struct file *file; + u64 addr; +}; + struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; @@ -514,15 +559,6 @@ struct io_async_rw { struct wait_page_queue wpq; }; -struct io_async_ctx { - union { - struct io_async_rw rw; - struct io_async_msghdr msg; - struct io_async_connect connect; - struct io_timeout_data timeout; - }; -}; - enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, @@ -544,7 +580,6 @@ enum { REQ_F_BUFFER_SELECTED_BIT, REQ_F_NO_FILE_TABLE_BIT, REQ_F_WORK_INITIALIZED_BIT, - REQ_F_TASK_PINNED_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -590,8 +625,6 @@ enum { REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), /* io_wq_work is initialized */ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), - /* req->task is refcounted */ - REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT), }; struct async_poll { @@ -614,6 +647,7 @@ struct io_kiocb { struct io_sync sync; struct io_cancel cancel; struct io_timeout timeout; + struct io_timeout_rem timeout_rem; struct io_connect connect; struct io_sr_msg sr_msg; struct io_open open; @@ -629,7 +663,8 @@ struct io_kiocb { struct io_completion compl; }; - struct io_async_ctx *io; + /* opcode allocated if it needs to store data for async defer */ + void *async_data; u8 opcode; /* polled IO has completed */ u8 iopoll_completed; @@ -697,8 +732,6 @@ struct io_submit_state { }; struct io_op_def { - /* needs req->io allocated for deferral/async */ - unsigned async_ctx : 1; /* needs current->mm setup, does mm access */ unsigned needs_mm : 1; /* needs req->file assigned */ @@ -720,35 +753,49 @@ struct io_op_def { unsigned pollout : 1; /* op supports buffer selection */ unsigned buffer_select : 1; + /* needs rlimit(RLIMIT_FSIZE) assigned */ unsigned needs_fsize : 1; + /* must always have async data allocated */ + unsigned needs_async_data : 1; + /* needs blkcg context, issues async io potentially */ + unsigned needs_blkcg : 1; + /* size of async data needed, if any */ + unsigned short async_size; }; -static const struct io_op_def io_op_defs[] = { +static const struct io_op_def io_op_defs[] __read_mostly = { [IORING_OP_NOP] = {}, [IORING_OP_READV] = { - .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, + .needs_async_data = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITEV] = { - .async_ctx = 1, .needs_mm = 1, .needs_file = 1, 
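
struct io_restriction above stores permitted register and sqe opcodes as bitmaps plus allowed/required sqe flag masks; the enforcement side is not part of this excerpt, so the following is only a guess at how the bitmaps would be consulted, written as a sketch:

    /* hypothetical check against an io_restriction table; the real
     * helper is not visible in this excerpt */
    static bool sqe_allowed(struct io_restriction *r, u8 opcode, u8 sqe_flags)
    {
        if (!r->registered)
            return true;
        if (!test_bit(opcode, r->sqe_op))
            return false;
        if (sqe_flags & ~r->sqe_flags_allowed)
            return false;
        if ((sqe_flags & r->sqe_flags_required) != r->sqe_flags_required)
            return false;
        return true;
    }
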
.hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .needs_fsize = 1, + .needs_async_data = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_rw), }, [IORING_OP_FSYNC] = { .needs_file = 1, + .needs_blkcg = 1, }, [IORING_OP_READ_FIXED] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, @@ -756,6 +803,8 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .needs_fsize = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_rw), }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -764,27 +813,33 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_POLL_REMOVE] = {}, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, + .needs_blkcg = 1, }, [IORING_OP_SENDMSG] = { - .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .needs_fs = 1, .pollout = 1, + .needs_async_data = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_msghdr), }, [IORING_OP_RECVMSG] = { - .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .needs_fs = 1, .pollin = 1, .buffer_select = 1, + .needs_async_data = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_msghdr), }, [IORING_OP_TIMEOUT] = { - .async_ctx = 1, .needs_mm = 1, + .needs_async_data = 1, + .async_size = sizeof(struct io_timeout_data), }, [IORING_OP_TIMEOUT_REMOVE] = {}, [IORING_OP_ACCEPT] = { @@ -796,28 +851,33 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_ASYNC_CANCEL] = {}, [IORING_OP_LINK_TIMEOUT] = { - .async_ctx = 1, .needs_mm = 1, + .needs_async_data = 1, + .async_size = sizeof(struct io_timeout_data), }, [IORING_OP_CONNECT] = { - .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_async_data = 1, + .async_size = sizeof(struct io_async_connect), }, [IORING_OP_FALLOCATE] = { .needs_file = 1, .needs_fsize = 1, + .needs_blkcg = 1, }, [IORING_OP_OPENAT] = { .file_table = 1, .needs_fs = 1, + .needs_blkcg = 1, }, [IORING_OP_CLOSE] = { .needs_file = 1, .needs_file_no_error = 1, .file_table = 1, + .needs_blkcg = 1, }, [IORING_OP_FILES_UPDATE] = { .needs_mm = 1, @@ -827,6 +887,7 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_fs = 1, .file_table = 1, + .needs_blkcg = 1, }, [IORING_OP_READ] = { .needs_mm = 1, @@ -834,6 +895,8 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITE] = { .needs_mm = 1, @@ -841,18 +904,23 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .needs_fsize = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_rw), }, [IORING_OP_FADVISE] = { .needs_file = 1, + .needs_blkcg = 1, }, [IORING_OP_MADVISE] = { .needs_mm = 1, + .needs_blkcg = 1, }, [IORING_OP_SEND] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_blkcg = 1, }, [IORING_OP_RECV] = { .needs_mm = 1, @@ -860,10 +928,12 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, + .needs_blkcg = 1, }, [IORING_OP_OPENAT2] = { .file_table = 1, .needs_fs = 1, + .needs_blkcg = 1, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, @@ -873,6 +943,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, + .needs_blkcg = 1, }, 
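
With needs_async_data and async_size now carried in the op table, the old fixed-size struct io_async_ctx union gives way to an exact-size allocation per opcode; this is essentially what __io_alloc_async_data()/io_alloc_async_data() further down in this patch do, condensed here:

    /* condensed from the allocation helpers later in this patch */
    static int alloc_async_data(struct io_kiocb *req)
    {
        const struct io_op_def *def = &io_op_defs[req->opcode];

        if (!def->needs_async_data)
            return 0;
        req->async_data = kmalloc(def->async_size, GFP_KERNEL);
        return req->async_data ? 0 : -ENOMEM;
    }
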
[IORING_OP_PROVIDE_BUFFERS] = {}, [IORING_OP_REMOVE_BUFFERS] = {}, @@ -900,13 +971,10 @@ static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); -static int io_prep_work_files(struct io_kiocb *req); static void __io_clean_op(struct io_kiocb *req); -static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, - int fd, struct file **out_file, bool fixed); -static void __io_queue_sqe(struct io_kiocb *req, - const struct io_uring_sqe *sqe, - struct io_comp_state *cs); +static struct file *io_file_get(struct io_submit_state *state, + struct io_kiocb *req, int fd, bool fixed); +static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs); static void io_file_put_work(struct work_struct *work); static ssize_t io_import_iovec(int rw, struct io_kiocb *req, @@ -918,7 +986,7 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, static struct kmem_cache *req_cachep; -static const struct file_operations io_uring_fops; +static const struct file_operations io_uring_fops __read_mostly; struct sock *io_uring_get_socket(struct file *file) { @@ -933,14 +1001,6 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); -static void io_get_req_task(struct io_kiocb *req) -{ - if (req->flags & REQ_F_TASK_PINNED) - return; - get_task_struct(req->task); - req->flags |= REQ_F_TASK_PINNED; -} - static inline void io_clean_op(struct io_kiocb *req) { if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | @@ -948,13 +1008,6 @@ static inline void io_clean_op(struct io_kiocb *req) __io_clean_op(req); } -/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */ -static void __io_put_req_task(struct io_kiocb *req) -{ - if (req->flags & REQ_F_TASK_PINNED) - put_task_struct(req->task); -} - static void io_sq_thread_drop_mm(void) { struct mm_struct *mm = current->mm; @@ -969,9 +1022,10 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) { if (!current->mm) { if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) || - !mmget_not_zero(ctx->sqo_mm))) + !ctx->sqo_task->mm || + !mmget_not_zero(ctx->sqo_task->mm))) return -EFAULT; - kthread_use_mm(ctx->sqo_mm); + kthread_use_mm(ctx->sqo_task->mm); } return 0; @@ -985,6 +1039,26 @@ static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, return __io_sq_thread_acquire_mm(ctx); } +static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx, + struct cgroup_subsys_state **cur_css) + +{ +#ifdef CONFIG_BLK_CGROUP + /* puts the old one when swapping */ + if (*cur_css != ctx->sqo_blkcg_css) { + kthread_associate_blkcg(ctx->sqo_blkcg_css); + *cur_css = ctx->sqo_blkcg_css; + } +#endif +} + +static void io_sq_thread_unassociate_blkcg(void) +{ +#ifdef CONFIG_BLK_CGROUP + kthread_associate_blkcg(NULL); +#endif +} + static inline void req_set_fail_links(struct io_kiocb *req) { if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) @@ -1054,7 +1128,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) goto err; ctx->flags = p->flags; - init_waitqueue_head(&ctx->sqo_wait); + init_waitqueue_head(&ctx->sqo_sq_wait); + INIT_LIST_HEAD(&ctx->sqd_list); init_waitqueue_head(&ctx->cq_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->ref_comp); @@ -1121,6 +1196,10 @@ static bool io_req_clean_work(struct io_kiocb *req) mmdrop(req->work.mm); req->work.mm = NULL; } +#ifdef CONFIG_BLK_CGROUP + if (req->work.blkcg_css) + 
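
The __io_sq_thread_acquire_mm() change above pins the submitting task's mm instead of a cached ctx->sqo_mm, which closes the window where the ring outlives the mm owner. The logic reduces to the following (the SQPOLL-only guard is dropped for brevity):

    /* simplified: an SQPOLL kernel thread has no mm of its own, so it
     * pins the submitting task's mm before touching user memory */
    static int acquire_user_mm(struct io_ring_ctx *ctx)
    {
        if (current->mm)
            return 0;
        if (!ctx->sqo_task->mm || !mmget_not_zero(ctx->sqo_task->mm))
            return -EFAULT;     /* task exited, or mm already gone */
        kthread_use_mm(ctx->sqo_task->mm);
        return 0;
    }
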
css_put(req->work.blkcg_css); +#endif if (req->work.creds) { put_cred(req->work.creds); req->work.creds = NULL; @@ -1146,20 +1225,45 @@ static bool io_req_clean_work(struct io_kiocb *req) static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; + struct io_ring_ctx *ctx = req->ctx; io_req_init_async(req); if (req->flags & REQ_F_ISREG) { - if (def->hash_reg_file || (req->ctx->flags & IORING_SETUP_IOPOLL)) + if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else { if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } + if (!req->work.files && io_op_defs[req->opcode].file_table && + !(req->flags & REQ_F_NO_FILE_TABLE)) { + req->work.files = get_files_struct(current); + get_nsproxy(current->nsproxy); + req->work.nsproxy = current->nsproxy; + req->flags |= REQ_F_INFLIGHT; + + spin_lock_irq(&ctx->inflight_lock); + list_add(&req->inflight_entry, &ctx->inflight_list); + spin_unlock_irq(&ctx->inflight_lock); + } if (!req->work.mm && def->needs_mm) { mmgrab(current->mm); req->work.mm = current->mm; } +#ifdef CONFIG_BLK_CGROUP + if (!req->work.blkcg_css && def->needs_blkcg) { + rcu_read_lock(); + req->work.blkcg_css = blkcg_css(); + /* + * This should be rare, either the cgroup is dying or the task + * is moving cgroups. Just punt to root for the handful of ios. + */ + if (!css_tryget_online(req->work.blkcg_css)) + req->work.blkcg_css = NULL; + rcu_read_unlock(); + } +#endif if (!req->work.creds) req->work.creds = get_current_cred(); if (!req->work.fs && def->needs_fs) { @@ -1213,9 +1317,10 @@ static void io_queue_async_work(struct io_kiocb *req) static void io_kill_timeout(struct io_kiocb *req) { + struct io_timeout_data *io = req->async_data; int ret; - ret = hrtimer_try_to_cancel(&req->io->timeout.timer); + ret = hrtimer_try_to_cancel(&io->timer); if (ret != -1) { atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); @@ -1226,14 +1331,36 @@ static void io_kill_timeout(struct io_kiocb *req) } } -static void io_kill_timeouts(struct io_ring_ctx *ctx) +static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (!tsk || req->task == tsk) + return true; + if (ctx->flags & IORING_SETUP_SQPOLL) { + if (ctx->sq_data && req->task == ctx->sq_data->thread) + return true; + } + return false; +} + +/* + * Returns true if we found and killed one or more timeouts + */ +static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk) { struct io_kiocb *req, *tmp; + int canceled = 0; spin_lock_irq(&ctx->completion_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) - io_kill_timeout(req); + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { + if (io_task_match(req, tsk)) { + io_kill_timeout(req); + canceled++; + } + } spin_unlock_irq(&ctx->completion_lock); + return canceled != 0; } static void __io_queue_deferred(struct io_ring_ctx *ctx) @@ -1284,6 +1411,13 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) __io_queue_deferred(ctx); } +static inline bool io_sqring_full(struct io_ring_ctx *ctx) +{ + struct io_rings *r = ctx->rings; + + return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries; +} + static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; @@ -1317,8 +1451,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (waitqueue_active(&ctx->wait)) 
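
io_sqring_full() above works because the SQ head and tail are free-running 32-bit counters: they only ever increase, so unsigned subtraction yields the number of occupied entries even across wraparound. A two-line demonstration:

    #include <stdio.h>
    #include <stdint.h>

    /* free-running ring indices: fullness survives u32 wraparound */
    int main(void)
    {
        uint32_t entries = 8;
        uint32_t head = 0xfffffffc, tail = head + 8;  /* tail has wrapped */

        printf("used=%u full=%d\n", (unsigned)(tail - head),
               tail - head == entries);
        return 0;
    }
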
wake_up(&ctx->wait); - if (waitqueue_active(&ctx->sqo_wait)) - wake_up(&ctx->sqo_wait); + if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) + wake_up(&ctx->sq_data->wait); if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); } @@ -1332,12 +1466,24 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx) } } +static inline bool io_match_files(struct io_kiocb *req, + struct files_struct *files) +{ + if (!files) + return true; + if (req->flags & REQ_F_WORK_INITIALIZED) + return req->work.files == files; + return false; +} + /* Returns true if there are no backlogged entries after the flush */ -static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) +static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, + struct task_struct *tsk, + struct files_struct *files) { struct io_rings *rings = ctx->rings; + struct io_kiocb *req, *tmp; struct io_uring_cqe *cqe; - struct io_kiocb *req; unsigned long flags; LIST_HEAD(list); @@ -1356,13 +1502,16 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) ctx->cq_overflow_flushed = 1; cqe = NULL; - while (!list_empty(&ctx->cq_overflow_list)) { + list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { + if (tsk && req->task != tsk) + continue; + if (!io_match_files(req, files)) + continue; + cqe = io_get_cqring(ctx); if (!cqe && !force) break; - req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb, - compl.list); list_move(&req->compl.list, &list); if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); @@ -1406,7 +1555,12 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - } else if (ctx->cq_overflow_flushed) { + } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) { + /* + * If we're in ring overflow flush mode, or in task cancel mode, + * then we cannot store the request for later flushing, we need + * to drop it on the floor. 
+ */ WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); } else { @@ -1509,10 +1663,8 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx) static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, struct io_submit_state *state) { - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; - struct io_kiocb *req; - if (!state->free_reqs) { + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; size_t sz; int ret; @@ -1529,14 +1681,11 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, goto fallback; ret = 1; } - state->free_reqs = ret - 1; - req = state->reqs[ret - 1]; - } else { - state->free_reqs--; - req = state->reqs[state->free_reqs]; + state->free_reqs = ret; } - return req; + state->free_reqs--; + return state->reqs[state->free_reqs]; fallback: return io_get_fallback_req(ctx); } @@ -1554,8 +1703,8 @@ static bool io_dismantle_req(struct io_kiocb *req) { io_clean_op(req); - if (req->io) - kfree(req->io); + if (req->async_data) + kfree(req->async_data); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); @@ -1564,9 +1713,14 @@ static bool io_dismantle_req(struct io_kiocb *req) static void __io_free_req_finish(struct io_kiocb *req) { + struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; - __io_put_req_task(req); + atomic_long_inc(&tctx->req_complete); + if (tctx->in_idle) + wake_up(&tctx->wait); + put_task_struct(req->task); + if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); else @@ -1609,10 +1763,11 @@ static void __io_free_req(struct io_kiocb *req) static bool io_link_cancel_timeout(struct io_kiocb *req) { + struct io_timeout_data *io = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret; - ret = hrtimer_try_to_cancel(&req->io->timeout.timer); + ret = hrtimer_try_to_cancel(&io->timer); if (ret != -1) { io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(ctx); @@ -1746,13 +1901,15 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req) return __io_req_find_next(req); } -static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb, - bool twa_signal_ok) +static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok) { struct task_struct *tsk = req->task; struct io_ring_ctx *ctx = req->ctx; int ret, notify; + if (tsk->flags & PF_EXITING) + return -ESRCH; + /* * SQPOLL kernel thread doesn't need notification, just a wakeup. 
For * all other cases, use TWA_SIGNAL unconditionally to ensure we're @@ -1763,7 +1920,7 @@ static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb, if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok) notify = TWA_SIGNAL; - ret = task_work_add(tsk, cb, notify); + ret = task_work_add(tsk, &req->task_work, notify); if (!ret) wake_up_process(tsk); @@ -1787,8 +1944,10 @@ static void __io_req_task_cancel(struct io_kiocb *req, int error) static void io_req_task_cancel(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_ring_ctx *ctx = req->ctx; __io_req_task_cancel(req, -ECANCELED); + percpu_ref_put(&ctx->refs); } static void __io_req_task_submit(struct io_kiocb *req) @@ -1797,7 +1956,7 @@ static void __io_req_task_submit(struct io_kiocb *req) if (!__io_sq_thread_acquire_mm(ctx)) { mutex_lock(&ctx->uring_lock); - __io_queue_sqe(req, NULL, NULL); + __io_queue_sqe(req, NULL); mutex_unlock(&ctx->uring_lock); } else { __io_req_task_cancel(req, -EFAULT); @@ -1820,7 +1979,7 @@ static void io_req_task_queue(struct io_kiocb *req) init_task_work(&req->task_work, io_req_task_submit); percpu_ref_get(&req->ctx->refs); - ret = io_req_task_work_add(req, &req->task_work, true); + ret = io_req_task_work_add(req, true); if (unlikely(ret)) { struct task_struct *tsk; @@ -1874,6 +2033,7 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx, if (rb->to_free) __io_req_free_batch_flush(ctx, rb); if (rb->task) { + atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete); put_task_struct_many(rb->task, rb->task_refs); rb->task = NULL; } @@ -1888,16 +2048,15 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) if (req->flags & REQ_F_LINK_HEAD) io_queue_next(req); - if (req->flags & REQ_F_TASK_PINNED) { - if (req->task != rb->task) { - if (rb->task) - put_task_struct_many(rb->task, rb->task_refs); - rb->task = req->task; - rb->task_refs = 0; + if (req->task != rb->task) { + if (rb->task) { + atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete); + put_task_struct_many(rb->task, rb->task_refs); } - rb->task_refs++; - req->flags &= ~REQ_F_TASK_PINNED; + rb->task = req->task; + rb->task_refs = 0; } + rb->task_refs++; WARN_ON_ONCE(io_dismantle_req(req)); rb->reqs[rb->to_free++] = req; @@ -1973,7 +2132,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) if (noflush && !list_empty(&ctx->cq_overflow_list)) return -1U; - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx, false, NULL, NULL); } /* See comment at the top of this file */ @@ -2010,6 +2169,12 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) static inline bool io_run_task_work(void) { + /* + * Not safe to run on exiting task, and the task_work handling will + * not add work to such a task. 
+ */ + if (unlikely(current->flags & PF_EXITING)) + return false; if (current->task_works) { __set_current_state(TASK_RUNNING); task_work_run(); @@ -2283,13 +2448,17 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error) goto end_req; } - ret = io_import_iovec(rw, req, &iovec, &iter, false); - if (ret < 0) - goto end_req; - ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false); - if (!ret) + if (!req->async_data) { + ret = io_import_iovec(rw, req, &iovec, &iter, false); + if (ret < 0) + goto end_req; + ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false); + if (!ret) + return true; + kfree(iovec); + } else { return true; - kfree(iovec); + } end_req: req_set_fail_links(req); io_req_complete(req, ret); @@ -2386,8 +2555,8 @@ static void io_iopoll_req_issued(struct io_kiocb *req) list_add_tail(&req->inflight_entry, &ctx->iopoll_list); if ((ctx->flags & IORING_SETUP_SQPOLL) && - wq_has_sleeper(&ctx->sqo_wait)) - wake_up(&ctx->sqo_wait); + wq_has_sleeper(&ctx->sq_data->wait)) + wake_up(&ctx->sq_data->wait); } static void __io_state_file_put(struct io_submit_state *state) @@ -2416,7 +2585,6 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) if (state->file) { if (state->fd == fd) { state->has_refs--; - state->ios_left--; return state->file; } __io_state_file_put(state); @@ -2426,8 +2594,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) return NULL; state->fd = fd; - state->ios_left--; - state->has_refs = state->ios_left; + state->has_refs = state->ios_left - 1; return state->file; } @@ -2476,8 +2643,7 @@ static bool io_file_supports_async(struct file *file, int rw) return file->f_op->write_iter != NULL; } -static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw.kiocb; @@ -2512,12 +2678,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (kiocb->ki_flags & IOCB_NOWAIT) req->flags |= REQ_F_NOWAIT; - if (kiocb->ki_flags & IOCB_DIRECT) - io_get_req_task(req); - - if (force_nonblock) - kiocb->ki_flags |= IOCB_NOWAIT; - if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !kiocb->ki_filp->f_op->iopoll) @@ -2526,7 +2686,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; req->iopoll_completed = 0; - io_get_req_task(req); } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; @@ -2564,13 +2723,14 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_comp_state *cs) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + struct io_async_rw *io = req->async_data; /* add previously done IO, if any */ - if (req->io && req->io->rw.bytes_done > 0) { + if (io && io->bytes_done > 0) { if (ret < 0) - ret = req->io->rw.bytes_done; + ret = io->bytes_done; else - ret += req->io->rw.bytes_done; + ret += io->bytes_done; } if (req->flags & REQ_F_CUR_POS) @@ -2587,18 +2747,12 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw, struct io_ring_ctx *ctx = req->ctx; size_t len = req->rw.len; struct io_mapped_ubuf *imu; - u16 index, buf_index; + u16 index, buf_index = req->buf_index; size_t offset; u64 buf_addr; - /* attempt to use fixed buffers without having provided iovecs */ - if (unlikely(!ctx->user_bufs)) - return -EFAULT; - - buf_index = 
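
The __io_file_get() adjustments above rework the batched-fget() accounting: one reference grab covers the whole submission window, and repeated uses of the same fd simply consume cached references. In outline, assuming the fget_many() helper this file already uses (a sketch, not the full submit-state handling):

    /* simplified: cache one multi-ref grab for repeated uses of one fd */
    static struct file *state_file_get(struct io_submit_state *state, int fd)
    {
        if (state->file) {
            if (state->fd == fd) {
                state->has_refs--;          /* reuse a cached reference */
                return state->file;
            }
            __io_state_file_put(state);     /* different fd: drop cache */
        }
        state->file = fget_many(fd, state->ios_left);
        if (!state->file)
            return NULL;
        state->fd = fd;
        state->has_refs = state->ios_left - 1;
        return state->file;
    }
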
req->buf_index; if (unlikely(buf_index >= ctx->nr_user_bufs)) return -EFAULT; - index = array_index_nospec(buf_index, ctx->nr_user_bufs); imu = &ctx->user_bufs[index]; buf_addr = req->rw.addr; @@ -2837,28 +2991,25 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, return ret; } -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV, - iovec, iter); -#endif - - return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); + return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter, + req->ctx->compat); } static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter, bool needs_lock) { - if (!req->io) + struct io_async_rw *iorw = req->async_data; + + if (!iorw) return __io_import_iovec(rw, req, iovec, iter, needs_lock); *iovec = NULL; - return iov_iter_count(&req->io->rw.iter); + return iov_iter_count(&iorw->iter); } static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) { - return kiocb->ki_filp->f_mode & FMODE_STREAM ? NULL : &kiocb->ki_pos; + return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; } /* @@ -2922,10 +3073,10 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, struct iov_iter *iter) { - struct io_async_rw *rw = &req->io->rw; + struct io_async_rw *rw = req->async_data; memcpy(&rw->iter, iter, sizeof(*iter)); - rw->free_iovec = NULL; + rw->free_iovec = iovec; rw->bytes_done = 0; /* can only be fixed buffers, no need to do anything */ if (iter->type == ITER_BVEC) @@ -2942,33 +3093,33 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, memcpy(rw->fast_iov + iov_off, fast_iov + iov_off, sizeof(struct iovec) * iter->nr_segs); } else { - rw->free_iovec = iovec; req->flags |= REQ_F_NEED_CLEANUP; } } -static inline int __io_alloc_async_ctx(struct io_kiocb *req) +static inline int __io_alloc_async_data(struct io_kiocb *req) { - req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); - return req->io == NULL; + WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); + req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); + return req->async_data == NULL; } -static int io_alloc_async_ctx(struct io_kiocb *req) +static int io_alloc_async_data(struct io_kiocb *req) { - if (!io_op_defs[req->opcode].async_ctx) + if (!io_op_defs[req->opcode].needs_async_data) return 0; - return __io_alloc_async_ctx(req); + return __io_alloc_async_data(req); } static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, struct iov_iter *iter, bool force) { - if (!force && !io_op_defs[req->opcode].async_ctx) + if (!force && !io_op_defs[req->opcode].needs_async_data) return 0; - if (!req->io) { - if (__io_alloc_async_ctx(req)) + if (!req->async_data) { + if (__io_alloc_async_data(req)) return -ENOMEM; io_req_map_rw(req, iovec, fast_iov, iter); @@ -2976,29 +3127,28 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, return 0; } -static inline int io_rw_prep_async(struct io_kiocb *req, int rw, - bool force_nonblock) +static inline int io_rw_prep_async(struct io_kiocb *req, int rw) { - struct io_async_rw *iorw = &req->io->rw; - struct iovec *iov; + struct io_async_rw *iorw = req->async_data; + struct iovec *iov = iorw->fast_iov; ssize_t ret; - iorw->iter.iov = iov = iorw->fast_iov; - ret = __io_import_iovec(rw, req, &iov, &iorw->iter, 
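
The same substitution appears throughout this series (aio earlier, the send/recv paths below): the #ifdef CONFIG_COMPAT branch pair collapses into a single __import_iovec() call that takes the compat decision as its final argument. Side by side:

    /* before: two call sites guarded by CONFIG_COMPAT */
    #ifdef CONFIG_COMPAT
        if (ctx->compat)
            return compat_import_iovec(rw, buf, len, UIO_FASTIOV,
                                       iovec, iter);
    #endif
        return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter);

    /* after: one call, compat handled inside the iov_iter core */
        return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter,
                              ctx->compat);
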
!force_nonblock); + ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false); if (unlikely(ret < 0)) return ret; - iorw->iter.iov = iov; - io_req_map_rw(req, iorw->iter.iov, iorw->fast_iov, &iorw->iter); + iorw->bytes_done = 0; + iorw->free_iovec = iov; + if (iov) + req->flags |= REQ_F_NEED_CLEANUP; return 0; } -static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) +static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { ssize_t ret; - ret = io_prep_rw(req, sqe, force_nonblock); + ret = io_prep_rw(req, sqe); if (ret) return ret; @@ -3006,9 +3156,9 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return -EBADF; /* either don't need iovec imported or already have it */ - if (!req->io || req->flags & REQ_F_NEED_CLEANUP) + if (!req->async_data) return 0; - return io_rw_prep_async(req, READ, force_nonblock); + return io_rw_prep_async(req, READ); } /* @@ -3034,6 +3184,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, if (!wake_page_match(wpq, key)) return 0; + req->rw.kiocb.ki_flags &= ~IOCB_WAITQ; list_del_init(&wait->entry); init_task_work(&req->task_work, io_req_task_submit); @@ -3041,7 +3192,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); - ret = io_req_task_work_add(req, &req->task_work, true); + ret = io_req_task_work_add(req, true); if (unlikely(ret)) { struct task_struct *tsk; @@ -3068,7 +3219,8 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, */ static bool io_rw_should_retry(struct io_kiocb *req) { - struct wait_page_queue *wait = &req->io->rw.wpq; + struct io_async_rw *rw = req->async_data; + struct wait_page_queue *wait = &rw->wpq; struct kiocb *kiocb = &req->rw.kiocb; /* never retry for NOWAIT, we just complete with -EAGAIN */ @@ -3091,9 +3243,8 @@ static bool io_rw_should_retry(struct io_kiocb *req) wait->wait.flags = 0; INIT_LIST_HEAD(&wait->wait.entry); kiocb->ki_flags |= IOCB_WAITQ; + kiocb->ki_flags &= ~IOCB_NOWAIT; kiocb->ki_waitq = wait; - - io_get_req_task(req); return true; } @@ -3113,11 +3264,13 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter __iter, *iter = &__iter; + struct io_async_rw *rw = req->async_data; ssize_t io_size, ret, ret2; size_t iov_count; + bool no_async; - if (req->io) - iter = &req->io->rw.iter; + if (rw) + iter = &rw->iter; ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); if (ret < 0) @@ -3130,9 +3283,13 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, /* Ensure we clear previously set non-block flag */ if (!force_nonblock) kiocb->ki_flags &= ~IOCB_NOWAIT; + else + kiocb->ki_flags |= IOCB_NOWAIT; + /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_async(req->file, READ)) + no_async = force_nonblock && !io_file_supports_async(req->file, READ); + if (no_async) goto copy_iov; ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count); @@ -3155,10 +3312,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, goto done; /* some cases will consume bytes even on error returns */ iov_iter_revert(iter, iov_count - iov_iter_count(iter)); - ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); - if (ret) - goto out_free; - return -EAGAIN; + ret = 0; + goto 
copy_iov; } else if (ret < 0) { /* make sure -ERESTARTSYS -> -EINTR is done */ goto done; @@ -3176,12 +3331,15 @@ copy_iov: ret = ret2; goto out_free; } + if (no_async) + return -EAGAIN; + rw = req->async_data; /* it's copied and will be cleaned with ->io */ iovec = NULL; /* now use our persistent iterator, if we aren't already */ - iter = &req->io->rw.iter; + iter = &rw->iter; retry: - req->io->rw.bytes_done += ret; + rw->bytes_done += ret; /* if we can retry, do so with the callbacks armed */ if (!io_rw_should_retry(req)) { kiocb->ki_flags &= ~IOCB_WAITQ; @@ -3212,12 +3370,11 @@ out_free: return ret; } -static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) +static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { ssize_t ret; - ret = io_prep_rw(req, sqe, force_nonblock); + ret = io_prep_rw(req, sqe); if (ret) return ret; @@ -3225,9 +3382,9 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return -EBADF; /* either don't need iovec imported or already have it */ - if (!req->io || req->flags & REQ_F_NEED_CLEANUP) + if (!req->async_data) return 0; - return io_rw_prep_async(req, WRITE, force_nonblock); + return io_rw_prep_async(req, WRITE); } static int io_write(struct io_kiocb *req, bool force_nonblock, @@ -3236,11 +3393,12 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter __iter, *iter = &__iter; + struct io_async_rw *rw = req->async_data; size_t iov_count; ssize_t ret, ret2, io_size; - if (req->io) - iter = &req->io->rw.iter; + if (rw) + iter = &rw->iter; ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); if (ret < 0) @@ -3251,7 +3409,9 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, /* Ensure we clear previously set non-block flag */ if (!force_nonblock) - req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; + kiocb->ki_flags &= ~IOCB_NOWAIT; + else + kiocb->ki_flags |= IOCB_NOWAIT; /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, WRITE)) @@ -3323,10 +3483,7 @@ static int __io_splice_prep(struct io_kiocb *req, { struct io_splice* sp = &req->splice; unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; - int ret; - if (req->flags & REQ_F_NEED_CLEANUP) - return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -3337,10 +3494,10 @@ static int __io_splice_prep(struct io_kiocb *req, if (unlikely(sp->flags & ~valid_flags)) return -EINVAL; - ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in, - (sp->flags & SPLICE_F_FD_IN_FIXED)); - if (ret) - return ret; + sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), + (sp->flags & SPLICE_F_FD_IN_FIXED)); + if (!sp->file_in) + return -EBADF; req->flags |= REQ_F_NEED_CLEANUP; if (!S_ISREG(file_inode(sp->file_in)->i_mode)) { @@ -3508,8 +3665,6 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe const char __user *fname; int ret; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) - return -EINVAL; if (unlikely(sqe->ioprio || sqe->buf_index)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) @@ -3536,8 +3691,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { u64 flags, mode; - if (req->flags & REQ_F_NEED_CLEANUP) - return 0; + if (unlikely(req->ctx->flags & 
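
The io_read() rework above keeps its progress in the per-request async data, so a page-wait retry resumes where the previous attempt stopped rather than re-reading from the start. A user-space analogue of that resume loop (toy names throughout):

    #include <stdio.h>

    /* analogue of io_read(): progress lives in 'bytes_done' across
     * retries instead of restarting from scratch */
    struct toy_rw { size_t bytes_done; };

    static size_t backend_read(size_t want, int *attempt)
    {
        /* pretend the first pass stops short (page not ready yet) */
        return (*attempt)++ == 0 ? want / 2 : want;
    }

    static size_t toy_read(struct toy_rw *rw, size_t len)
    {
        int attempt = 0;

        for (;;) {
            size_t got = backend_read(len - rw->bytes_done, &attempt);

            rw->bytes_done += got;
            if (rw->bytes_done >= len)
                return rw->bytes_done;
            /* the kernel would arm the page waitqueue and return
             * -EAGAIN here; the wake callback re-enters with
             * bytes_done preserved */
        }
    }

    int main(void)
    {
        struct toy_rw rw = { 0 };

        printf("read %zu bytes\n", toy_read(&rw, 4096));
        return 0;
    }
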
(IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; mode = READ_ONCE(sqe->len); flags = READ_ONCE(sqe->open_flags); req->open.how = build_open_how(flags, mode); @@ -3550,8 +3705,8 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) size_t len; int ret; - if (req->flags & REQ_F_NEED_CLEANUP) - return 0; + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); len = READ_ONCE(sqe->len); if (len < OPEN_HOW_SIZE_VER0) @@ -3767,7 +3922,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, #if defined(CONFIG_EPOLL) if (sqe->ioprio || sqe->buf_index) return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL))) return -EINVAL; req->epoll.epfd = READ_ONCE(sqe->fd); @@ -3882,7 +4037,7 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock) static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL))) return -EINVAL; if (sqe->ioprio || sqe->buf_index) return -EINVAL; @@ -3938,8 +4093,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EBADF; req->close.fd = READ_ONCE(sqe->fd); - if ((req->file && req->file->f_op == &io_uring_fops) || - req->close.fd == req->ctx->ring_fd) + if ((req->file && req->file->f_op == &io_uring_fops)) return -EBADF; req->close.put_file = NULL; @@ -4016,15 +4170,18 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) static int io_setup_async_msg(struct io_kiocb *req, struct io_async_msghdr *kmsg) { - if (req->io) + struct io_async_msghdr *async_msg = req->async_data; + + if (async_msg) return -EAGAIN; - if (io_alloc_async_ctx(req)) { + if (io_alloc_async_data(req)) { if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); return -ENOMEM; } + async_msg = req->async_data; req->flags |= REQ_F_NEED_CLEANUP; - memcpy(&req->io->msg, kmsg, sizeof(*kmsg)); + memcpy(async_msg, kmsg, sizeof(*kmsg)); return -EAGAIN; } @@ -4039,8 +4196,8 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_async_msghdr *async_msg = req->async_data; struct io_sr_msg *sr = &req->sr_msg; - struct io_async_ctx *io = req->io; int ret; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -4055,13 +4212,9 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags |= MSG_CMSG_COMPAT; #endif - if (!io || req->opcode == IORING_OP_SEND) - return 0; - /* iovec is already imported */ - if (req->flags & REQ_F_NEED_CLEANUP) + if (!async_msg || !io_op_defs[req->opcode].needs_async_data) return 0; - - ret = io_sendmsg_copy_hdr(req, &io->msg); + ret = io_sendmsg_copy_hdr(req, async_msg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; @@ -4079,9 +4232,9 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, if (unlikely(!sock)) return ret; - if (req->io) { - kmsg = &req->io->msg; - kmsg->msg.msg_name = &req->io->msg.addr; + if (req->async_data) { + kmsg = req->async_data; + kmsg->msg.msg_name = &kmsg->addr; /* if iov is set, it's allocated already */ if (!kmsg->iov) kmsg->iov = kmsg->fast_iov; @@ -4130,7 +4283,7 @@ static int io_send(struct io_kiocb *req, bool force_nonblock, ret = import_single_range(WRITE, sr->buf, sr->len, &iov, 
&msg.msg_iter); if (unlikely(ret)) - return ret;; + return ret; msg.msg_name = NULL; msg.msg_control = NULL; @@ -4179,8 +4332,9 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, sr->len); iomsg->iov = NULL; } else { - ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV, - &iomsg->iov, &iomsg->msg.msg_iter); + ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, + &iomsg->iov, &iomsg->msg.msg_iter, + false); if (ret > 0) ret = 0; } @@ -4220,9 +4374,9 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, sr->len = iomsg->iov[0].iov_len; iomsg->iov = NULL; } else { - ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV, - &iomsg->iov, - &iomsg->msg.msg_iter); + ret = __import_iovec(READ, (struct iovec __user *)uiov, len, + UIO_FASTIOV, &iomsg->iov, + &iomsg->msg.msg_iter, true); if (ret < 0) return ret; } @@ -4268,8 +4422,8 @@ static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_async_msghdr *async_msg = req->async_data; struct io_sr_msg *sr = &req->sr_msg; - struct io_async_ctx *io = req->io; int ret; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -4285,13 +4439,9 @@ static int io_recvmsg_prep(struct io_kiocb *req, sr->msg_flags |= MSG_CMSG_COMPAT; #endif - if (!io || req->opcode == IORING_OP_RECV) + if (!async_msg || !io_op_defs[req->opcode].needs_async_data) return 0; - /* iovec is already imported */ - if (req->flags & REQ_F_NEED_CLEANUP) - return 0; - - ret = io_recvmsg_copy_hdr(req, &io->msg); + ret = io_recvmsg_copy_hdr(req, async_msg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; @@ -4310,9 +4460,9 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, if (unlikely(!sock)) return ret; - if (req->io) { - kmsg = &req->io->msg; - kmsg->msg.msg_name = &req->io->msg.addr; + if (req->async_data) { + kmsg = req->async_data; + kmsg->msg.msg_name = &kmsg->addr; /* if iov is set, it's allocated already */ if (!kmsg->iov) kmsg->iov = kmsg->fast_iov; @@ -4454,7 +4604,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock, static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_connect *conn = &req->connect; - struct io_async_ctx *io = req->io; + struct io_async_connect *io = req->async_data; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; @@ -4468,22 +4618,22 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; return move_addr_to_kernel(conn->addr, conn->addr_len, - &io->connect.address); + &io->address); } static int io_connect(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { - struct io_async_ctx __io, *io; + struct io_async_connect __io, *io; unsigned file_flags; int ret; - if (req->io) { - io = req->io; + if (req->async_data) { + io = req->async_data; } else { ret = move_addr_to_kernel(req->connect.addr, req->connect.addr_len, - &__io.connect.address); + &__io.address); if (ret) goto out; io = &__io; @@ -4491,16 +4641,17 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock, file_flags = force_nonblock ? 
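
io_connect() in the surrounding hunk illustrates the pattern the async_data conversion enables: run with on-stack state on the fast path, and copy it into an allocated async block only if the operation actually goes async. A trimmed sketch, with error handling elided:

    /* sketch of the on-stack-first pattern; not the complete function */
    static int connect_like(struct io_kiocb *req, bool force_nonblock)
    {
        struct io_async_connect __io, *io = req->async_data;
        int ret;

        if (!io) {
            move_addr_to_kernel(req->connect.addr,
                                req->connect.addr_len, &__io.address);
            io = &__io;                     /* fast path: stack copy */
        }
        ret = __sys_connect_file(req->file, &io->address,
                                 req->connect.addr_len,
                                 force_nonblock ? O_NONBLOCK : 0);
        if (ret == -EAGAIN && force_nonblock && !req->async_data) {
            if (io_alloc_async_data(req))
                return -ENOMEM;
            memcpy(req->async_data, &__io, sizeof(__io));
        }
        return ret;                         /* punted with state saved */
    }
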
O_NONBLOCK : 0; - ret = __sys_connect_file(req->file, &io->connect.address, + ret = __sys_connect_file(req->file, &io->address, req->connect.addr_len, file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { - if (req->io) + if (req->async_data) return -EAGAIN; - if (io_alloc_async_ctx(req)) { + if (io_alloc_async_data(req)) { ret = -ENOMEM; goto out; } - memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect)); + io = req->async_data; + memcpy(req->async_data, &__io, sizeof(__io)); return -EAGAIN; } if (ret == -ERESTARTSYS) @@ -4608,7 +4759,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, * of executing it. We can't safely execute it anyway, as we may not * have the needed state needed for it anyway. */ - ret = io_req_task_work_add(req, &req->task_work, twa_signal_ok); + ret = io_req_task_work_add(req, twa_signal_ok); if (unlikely(ret)) { struct task_struct *tsk; @@ -4642,9 +4793,9 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) { - /* pure poll stashes this in ->io, poll driven retry elsewhere */ + /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ if (req->opcode == IORING_OP_POLL_ADD) - return (struct io_poll_iocb *) req->io; + return req->async_data; return req->apoll->double_poll; } @@ -4724,6 +4875,8 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, if (mask && !(mask & poll->events)) return 0; + list_del_init(&wait->entry); + if (poll && poll->head) { bool done; @@ -4919,7 +5072,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) apoll->double_poll = NULL; req->flags |= REQ_F_POLLED; - io_get_req_task(req); req->apoll = apoll; INIT_HLIST_NODE(&req->hash_node); @@ -4928,6 +5080,12 @@ static bool io_arm_poll_handler(struct io_kiocb *req) mask |= POLLIN | POLLRDNORM; if (def->pollout) mask |= POLLOUT | POLLWRNORM; + + /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ + if ((req->opcode == IORING_OP_RECVMSG) && + (req->sr_msg.msg_flags & MSG_ERRQUEUE)) + mask &= ~POLLIN; + mask |= POLLERR | POLLPRI; ipt.pt._qproc = io_async_queue_proc; @@ -4994,7 +5152,10 @@ static bool io_poll_remove_one(struct io_kiocb *req) return do_complete; } -static void io_poll_remove_all(struct io_ring_ctx *ctx) +/* + * Returns true if we found and killed one or more poll requests + */ +static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk) { struct hlist_node *tmp; struct io_kiocb *req; @@ -5005,13 +5166,17 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx) struct hlist_head *list; list = &ctx->cancel_hash[i]; - hlist_for_each_entry_safe(req, tmp, list, hash_node) - posted += io_poll_remove_one(req); + hlist_for_each_entry_safe(req, tmp, list, hash_node) { + if (io_task_match(req, tsk)) + posted += io_poll_remove_one(req); + } } spin_unlock_irq(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); + + return posted != 0; } static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) @@ -5079,7 +5244,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->io); + __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data); } static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -5100,8 +5265,6 @@ 
static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe #endif poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP | (events & EPOLLEXCLUSIVE); - - io_get_req_task(req); return 0; } @@ -5140,16 +5303,10 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); + list_del_init(&req->timeout.list); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - /* - * We could be racing with timeout deletion. If the list is empty, - * then timeout lookup already found it and will be handling it. - */ - if (!list_empty(&req->timeout.list)) - list_del_init(&req->timeout.list); - io_cqring_fill_event(req, -ETIME); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -5162,13 +5319,13 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) static int __io_timeout_cancel(struct io_kiocb *req) { + struct io_timeout_data *io = req->async_data; int ret; - list_del_init(&req->timeout.list); - - ret = hrtimer_try_to_cancel(&req->io->timeout.timer); + ret = hrtimer_try_to_cancel(&io->timer); if (ret == -1) return -EALREADY; + list_del_init(&req->timeout.list); req_set_fail_links(req); req->flags |= REQ_F_COMP_LOCKED; @@ -5202,14 +5359,10 @@ static int io_timeout_remove_prep(struct io_kiocb *req, return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len) - return -EINVAL; - - req->timeout.addr = READ_ONCE(sqe->addr); - req->timeout.flags = READ_ONCE(sqe->timeout_flags); - if (req->timeout.flags) + if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags) return -EINVAL; + req->timeout_rem.addr = READ_ONCE(sqe->addr); return 0; } @@ -5222,7 +5375,7 @@ static int io_timeout_remove(struct io_kiocb *req) int ret; spin_lock_irq(&ctx->completion_lock); - ret = io_timeout_cancel(ctx, req->timeout.addr); + ret = io_timeout_cancel(ctx, req->timeout_rem.addr); io_cqring_fill_event(req, ret); io_commit_cqring(ctx); @@ -5253,10 +5406,10 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->timeout.off = off; - if (!req->io && io_alloc_async_ctx(req)) + if (!req->async_data && io_alloc_async_data(req)) return -ENOMEM; - data = &req->io->timeout; + data = req->async_data; data->req = req; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) @@ -5274,7 +5427,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, static int io_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_timeout_data *data = &req->io->timeout; + struct io_timeout_data *data = req->async_data; struct list_head *entry; u32 tail, off = req->timeout.off; @@ -5399,6 +5552,8 @@ static int io_async_cancel(struct io_kiocb *req) static int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL)) + return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; if (sqe->ioprio || sqe->rw_flags) @@ -5435,118 +5590,86 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock, return 0; } -static int io_req_defer_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - ssize_t ret = 0; - - if (!sqe) - return 0; - - if (io_alloc_async_ctx(req)) - return -EAGAIN; - ret = 
io_prep_work_files(req); - if (unlikely(ret)) - return ret; - switch (req->opcode) { case IORING_OP_NOP: - break; + return 0; case IORING_OP_READV: case IORING_OP_READ_FIXED: case IORING_OP_READ: - ret = io_read_prep(req, sqe, true); - break; + return io_read_prep(req, sqe); case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: - ret = io_write_prep(req, sqe, true); - break; + return io_write_prep(req, sqe); case IORING_OP_POLL_ADD: - ret = io_poll_add_prep(req, sqe); - break; + return io_poll_add_prep(req, sqe); case IORING_OP_POLL_REMOVE: - ret = io_poll_remove_prep(req, sqe); - break; + return io_poll_remove_prep(req, sqe); case IORING_OP_FSYNC: - ret = io_prep_fsync(req, sqe); - break; + return io_prep_fsync(req, sqe); case IORING_OP_SYNC_FILE_RANGE: - ret = io_prep_sfr(req, sqe); - break; + return io_prep_sfr(req, sqe); case IORING_OP_SENDMSG: case IORING_OP_SEND: - ret = io_sendmsg_prep(req, sqe); - break; + return io_sendmsg_prep(req, sqe); case IORING_OP_RECVMSG: case IORING_OP_RECV: - ret = io_recvmsg_prep(req, sqe); - break; + return io_recvmsg_prep(req, sqe); case IORING_OP_CONNECT: - ret = io_connect_prep(req, sqe); - break; + return io_connect_prep(req, sqe); case IORING_OP_TIMEOUT: - ret = io_timeout_prep(req, sqe, false); - break; + return io_timeout_prep(req, sqe, false); case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove_prep(req, sqe); - break; + return io_timeout_remove_prep(req, sqe); case IORING_OP_ASYNC_CANCEL: - ret = io_async_cancel_prep(req, sqe); - break; + return io_async_cancel_prep(req, sqe); case IORING_OP_LINK_TIMEOUT: - ret = io_timeout_prep(req, sqe, true); - break; + return io_timeout_prep(req, sqe, true); case IORING_OP_ACCEPT: - ret = io_accept_prep(req, sqe); - break; + return io_accept_prep(req, sqe); case IORING_OP_FALLOCATE: - ret = io_fallocate_prep(req, sqe); - break; + return io_fallocate_prep(req, sqe); case IORING_OP_OPENAT: - ret = io_openat_prep(req, sqe); - break; + return io_openat_prep(req, sqe); case IORING_OP_CLOSE: - ret = io_close_prep(req, sqe); - break; + return io_close_prep(req, sqe); case IORING_OP_FILES_UPDATE: - ret = io_files_update_prep(req, sqe); - break; + return io_files_update_prep(req, sqe); case IORING_OP_STATX: - ret = io_statx_prep(req, sqe); - break; + return io_statx_prep(req, sqe); case IORING_OP_FADVISE: - ret = io_fadvise_prep(req, sqe); - break; + return io_fadvise_prep(req, sqe); case IORING_OP_MADVISE: - ret = io_madvise_prep(req, sqe); - break; + return io_madvise_prep(req, sqe); case IORING_OP_OPENAT2: - ret = io_openat2_prep(req, sqe); - break; + return io_openat2_prep(req, sqe); case IORING_OP_EPOLL_CTL: - ret = io_epoll_ctl_prep(req, sqe); - break; + return io_epoll_ctl_prep(req, sqe); case IORING_OP_SPLICE: - ret = io_splice_prep(req, sqe); - break; + return io_splice_prep(req, sqe); case IORING_OP_PROVIDE_BUFFERS: - ret = io_provide_buffers_prep(req, sqe); - break; + return io_provide_buffers_prep(req, sqe); case IORING_OP_REMOVE_BUFFERS: - ret = io_remove_buffers_prep(req, sqe); - break; + return io_remove_buffers_prep(req, sqe); case IORING_OP_TEE: - ret = io_tee_prep(req, sqe); - break; - default: - printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", - req->opcode); - ret = -EINVAL; - break; + return io_tee_prep(req, sqe); } - return ret; + printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", + req->opcode); + return -EINVAL; +} + +static int io_req_defer_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (!sqe) + return 0; + if 
(io_alloc_async_data(req)) + return -EAGAIN; + return io_req_prep(req, sqe); } static u32 io_get_sequence(struct io_kiocb *req) @@ -5580,7 +5703,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) return 0; - if (!req->io) { + if (!req->async_data) { ret = io_req_defer_prep(req, sqe); if (ret) return ret; @@ -5606,10 +5729,24 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EIOCBQUEUED; } -static void __io_clean_op(struct io_kiocb *req) +static void io_req_drop_files(struct io_kiocb *req) { - struct io_async_ctx *io = req->io; + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->inflight_lock, flags); + list_del(&req->inflight_entry); + if (waitqueue_active(&ctx->inflight_wait)) + wake_up(&ctx->inflight_wait); + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + req->flags &= ~REQ_F_INFLIGHT; + put_files_struct(req->work.files); + put_nsproxy(req->work.nsproxy); + req->work.files = NULL; +} +static void __io_clean_op(struct io_kiocb *req) +{ if (req->flags & REQ_F_BUFFER_SELECTED) { switch (req->opcode) { case IORING_OP_READV: @@ -5632,39 +5769,39 @@ static void __io_clean_op(struct io_kiocb *req) case IORING_OP_READ: case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - if (io->rw.free_iovec) - kfree(io->rw.free_iovec); + case IORING_OP_WRITE: { + struct io_async_rw *io = req->async_data; + if (io->free_iovec) + kfree(io->free_iovec); break; + } case IORING_OP_RECVMSG: - case IORING_OP_SENDMSG: - if (io->msg.iov != io->msg.fast_iov) - kfree(io->msg.iov); + case IORING_OP_SENDMSG: { + struct io_async_msghdr *io = req->async_data; + if (io->iov != io->fast_iov) + kfree(io->iov); break; + } case IORING_OP_SPLICE: case IORING_OP_TEE: io_put_file(req, req->splice.file_in, (req->splice.flags & SPLICE_F_FD_IN_FIXED)); break; + case IORING_OP_OPENAT: + case IORING_OP_OPENAT2: + if (req->open.filename) + putname(req->open.filename); + break; } req->flags &= ~REQ_F_NEED_CLEANUP; } - if (req->flags & REQ_F_INFLIGHT) { - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->inflight_lock, flags); - list_del(&req->inflight_entry); - if (waitqueue_active(&ctx->inflight_wait)) - wake_up(&ctx->inflight_wait); - spin_unlock_irqrestore(&ctx->inflight_lock, flags); - req->flags &= ~REQ_F_INFLIGHT; - } + if (req->flags & REQ_F_INFLIGHT) + io_req_drop_files(req); } -static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock, struct io_comp_state *cs) +static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -5676,221 +5813,89 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, case IORING_OP_READV: case IORING_OP_READ_FIXED: case IORING_OP_READ: - if (sqe) { - ret = io_read_prep(req, sqe, force_nonblock); - if (ret < 0) - break; - } ret = io_read(req, force_nonblock, cs); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: - if (sqe) { - ret = io_write_prep(req, sqe, force_nonblock); - if (ret < 0) - break; - } ret = io_write(req, force_nonblock, cs); break; case IORING_OP_FSYNC: - if (sqe) { - ret = io_prep_fsync(req, sqe); - if (ret < 0) - break; - } ret = io_fsync(req, force_nonblock); break; case IORING_OP_POLL_ADD: - if (sqe) { - ret = io_poll_add_prep(req, sqe); - if (ret) - break; - } 
ret = io_poll_add(req); break; case IORING_OP_POLL_REMOVE: - if (sqe) { - ret = io_poll_remove_prep(req, sqe); - if (ret < 0) - break; - } ret = io_poll_remove(req); break; case IORING_OP_SYNC_FILE_RANGE: - if (sqe) { - ret = io_prep_sfr(req, sqe); - if (ret < 0) - break; - } ret = io_sync_file_range(req, force_nonblock); break; case IORING_OP_SENDMSG: + ret = io_sendmsg(req, force_nonblock, cs); + break; case IORING_OP_SEND: - if (sqe) { - ret = io_sendmsg_prep(req, sqe); - if (ret < 0) - break; - } - if (req->opcode == IORING_OP_SENDMSG) - ret = io_sendmsg(req, force_nonblock, cs); - else - ret = io_send(req, force_nonblock, cs); + ret = io_send(req, force_nonblock, cs); break; case IORING_OP_RECVMSG: + ret = io_recvmsg(req, force_nonblock, cs); + break; case IORING_OP_RECV: - if (sqe) { - ret = io_recvmsg_prep(req, sqe); - if (ret) - break; - } - if (req->opcode == IORING_OP_RECVMSG) - ret = io_recvmsg(req, force_nonblock, cs); - else - ret = io_recv(req, force_nonblock, cs); + ret = io_recv(req, force_nonblock, cs); break; case IORING_OP_TIMEOUT: - if (sqe) { - ret = io_timeout_prep(req, sqe, false); - if (ret) - break; - } ret = io_timeout(req); break; case IORING_OP_TIMEOUT_REMOVE: - if (sqe) { - ret = io_timeout_remove_prep(req, sqe); - if (ret) - break; - } ret = io_timeout_remove(req); break; case IORING_OP_ACCEPT: - if (sqe) { - ret = io_accept_prep(req, sqe); - if (ret) - break; - } ret = io_accept(req, force_nonblock, cs); break; case IORING_OP_CONNECT: - if (sqe) { - ret = io_connect_prep(req, sqe); - if (ret) - break; - } ret = io_connect(req, force_nonblock, cs); break; case IORING_OP_ASYNC_CANCEL: - if (sqe) { - ret = io_async_cancel_prep(req, sqe); - if (ret) - break; - } ret = io_async_cancel(req); break; case IORING_OP_FALLOCATE: - if (sqe) { - ret = io_fallocate_prep(req, sqe); - if (ret) - break; - } ret = io_fallocate(req, force_nonblock); break; case IORING_OP_OPENAT: - if (sqe) { - ret = io_openat_prep(req, sqe); - if (ret) - break; - } ret = io_openat(req, force_nonblock); break; case IORING_OP_CLOSE: - if (sqe) { - ret = io_close_prep(req, sqe); - if (ret) - break; - } ret = io_close(req, force_nonblock, cs); break; case IORING_OP_FILES_UPDATE: - if (sqe) { - ret = io_files_update_prep(req, sqe); - if (ret) - break; - } ret = io_files_update(req, force_nonblock, cs); break; case IORING_OP_STATX: - if (sqe) { - ret = io_statx_prep(req, sqe); - if (ret) - break; - } ret = io_statx(req, force_nonblock); break; case IORING_OP_FADVISE: - if (sqe) { - ret = io_fadvise_prep(req, sqe); - if (ret) - break; - } ret = io_fadvise(req, force_nonblock); break; case IORING_OP_MADVISE: - if (sqe) { - ret = io_madvise_prep(req, sqe); - if (ret) - break; - } ret = io_madvise(req, force_nonblock); break; case IORING_OP_OPENAT2: - if (sqe) { - ret = io_openat2_prep(req, sqe); - if (ret) - break; - } ret = io_openat2(req, force_nonblock); break; case IORING_OP_EPOLL_CTL: - if (sqe) { - ret = io_epoll_ctl_prep(req, sqe); - if (ret) - break; - } ret = io_epoll_ctl(req, force_nonblock, cs); break; case IORING_OP_SPLICE: - if (sqe) { - ret = io_splice_prep(req, sqe); - if (ret < 0) - break; - } ret = io_splice(req, force_nonblock); break; case IORING_OP_PROVIDE_BUFFERS: - if (sqe) { - ret = io_provide_buffers_prep(req, sqe); - if (ret) - break; - } ret = io_provide_buffers(req, force_nonblock, cs); break; case IORING_OP_REMOVE_BUFFERS: - if (sqe) { - ret = io_remove_buffers_prep(req, sqe); - if (ret) - break; - } ret = io_remove_buffers(req, force_nonblock, cs); break; case IORING_OP_TEE: 
- if (sqe) { - ret = io_tee_prep(req, sqe); - if (ret < 0) - break; - } ret = io_tee(req, force_nonblock); break; default: @@ -5936,7 +5941,7 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) if (!ret) { do { - ret = io_issue_sqe(req, NULL, false, NULL); + ret = io_issue_sqe(req, false, NULL); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -5965,20 +5970,19 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, return table->files[index & IORING_FILE_TABLE_MASK]; } -static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, - int fd, struct file **out_file, bool fixed) +static struct file *io_file_get(struct io_submit_state *state, + struct io_kiocb *req, int fd, bool fixed) { struct io_ring_ctx *ctx = req->ctx; struct file *file; if (fixed) { - if (unlikely(!ctx->file_data || - (unsigned) fd >= ctx->nr_user_files)) - return -EBADF; + if (unlikely((unsigned int)fd >= ctx->nr_user_files)) + return NULL; fd = array_index_nospec(fd, ctx->nr_user_files); file = io_file_from_index(ctx, fd); if (file) { - req->fixed_file_refs = ctx->file_data->cur_refs; + req->fixed_file_refs = &ctx->file_data->node->refs; percpu_ref_get(req->fixed_file_refs); } } else { @@ -5986,11 +5990,7 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, file = __io_file_get(state, fd); } - if (file || io_op_defs[req->opcode].needs_file_no_error) { - *out_file = file; - return 0; - } - return -EBADF; + return file; } static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, @@ -6002,46 +6002,10 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, if (unlikely(!fixed && io_async_submit(req->ctx))) return -EBADF; - return io_file_get(state, req, fd, &req->file, fixed); -} - -static int io_grab_files(struct io_kiocb *req) -{ - int ret = -EBADF; - struct io_ring_ctx *ctx = req->ctx; - - io_req_init_async(req); - - if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE)) + req->file = io_file_get(state, req, fd, fixed); + if (req->file || io_op_defs[req->opcode].needs_file_no_error) return 0; - if (!ctx->ring_file) - return -EBADF; - - rcu_read_lock(); - spin_lock_irq(&ctx->inflight_lock); - /* - * We use the f_ops->flush() handler to ensure that we can flush - * out work accessing these files if the fd is closed. Check if - * the fd has changed since we started down this path, and disallow - * this operation if it has. 
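[Editor's note] The reworked io_file_get() above indexes a two-level fixed-file table: the upper bits of the descriptor select a table, the lower bits a slot, and the user-controlled index is laundered through array_index_nospec() so a mispredicted bounds check cannot drive a speculative out-of-bounds load. A minimal userspace sketch of the same two-level lookup, with hypothetical names and sizes, and a plain bounds check standing in for the nospec helper:

#include <stdio.h>

#define FILE_TABLE_SHIFT 9                      /* 512 slots per table, mirroring IORING_MAX_FILES_TABLE */
#define FILE_TABLE_SIZE  (1U << FILE_TABLE_SHIFT)
#define FILE_TABLE_MASK  (FILE_TABLE_SIZE - 1)

struct file_table { void *files[FILE_TABLE_SIZE]; };

/* Hypothetical lookup: reject out-of-range indices, then split the index
 * into (table, slot). The kernel additionally masks 'fd' with
 * array_index_nospec() before using it as an array index. */
static void *file_from_index(struct file_table *tables,
                             unsigned int nr_user_files, unsigned int fd)
{
        if (fd >= nr_user_files)
                return NULL;
        return tables[fd >> FILE_TABLE_SHIFT].files[fd & FILE_TABLE_MASK];
}

int main(void)
{
        static struct file_table tables[2];
        int x;

        tables[1].files[3] = &x;                /* index 512 + 3 */
        printf("%p\n", file_from_index(tables, 1024, FILE_TABLE_SIZE + 3));
        return 0;
}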
- */ - if (fcheck(ctx->ring_fd) == ctx->ring_file) { - list_add(&req->inflight_entry, &ctx->inflight_list); - req->flags |= REQ_F_INFLIGHT; - req->work.files = current->files; - ret = 0; - } - spin_unlock_irq(&ctx->inflight_lock); - rcu_read_unlock(); - - return ret; -} - -static inline int io_prep_work_files(struct io_kiocb *req) -{ - if (!io_op_defs[req->opcode].file_table) - return 0; - return io_grab_files(req); + return -EBADF; } static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) @@ -6088,7 +6052,7 @@ static void __io_queue_linked_timeout(struct io_kiocb *req) * we got a chance to setup the timer */ if (!list_empty(&req->link_list)) { - struct io_timeout_data *data = &req->io->timeout; + struct io_timeout_data *data = req->async_data; data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), @@ -6126,8 +6090,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return nxt; } -static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_comp_state *cs) +static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs) { struct io_kiocb *linked_timeout; struct io_kiocb *nxt; @@ -6147,7 +6110,7 @@ again: old_creds = override_creds(req->work.creds); } - ret = io_issue_sqe(req, sqe, true, cs); + ret = io_issue_sqe(req, true, cs); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -6156,9 +6119,6 @@ again: if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { if (!io_arm_poll_handler(req)) { punt: - ret = io_prep_work_files(req); - if (unlikely(ret)) - goto err; /* * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. @@ -6172,7 +6132,6 @@ punt: } if (unlikely(ret)) { -err: /* un-prep timeout, so it'll be killed as any other linked */ req->flags &= ~REQ_F_LINK_TIMEOUT; req_set_fail_links(req); @@ -6212,7 +6171,7 @@ fail_req: io_req_complete(req, ret); } } else if (req->flags & REQ_F_FORCE_ASYNC) { - if (!req->io) { + if (!req->async_data) { ret = io_req_defer_prep(req, sqe); if (unlikely(ret)) goto fail_req; @@ -6226,7 +6185,12 @@ fail_req: req->work.flags |= IO_WQ_WORK_CONCURRENT; io_queue_async_work(req); } else { - __io_queue_sqe(req, sqe, cs); + if (sqe) { + ret = io_req_prep(req, sqe); + if (unlikely(ret)) + goto fail_req; + } + __io_queue_sqe(req, cs); } } @@ -6274,7 +6238,6 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, return ret; } trace_io_uring_link(ctx, req, head); - io_get_req_task(req); list_add_tail(&req->link_list, &head->link_list); /* last request of a link, enqueue the link */ @@ -6323,9 +6286,6 @@ static void io_submit_state_start(struct io_submit_state *state, struct io_ring_ctx *ctx, unsigned int max_ios) { blk_start_plug(&state->plug); -#ifdef CONFIG_BLOCK - state->plug.nowait = true; -#endif state->comp.nr = 0; INIT_LIST_HEAD(&state->comp.list); state->comp.ctx = ctx; @@ -6382,6 +6342,32 @@ static inline void io_consume_sqe(struct io_ring_ctx *ctx) ctx->cached_sq_head++; } +/* + * Check SQE restrictions (opcode and flags). + * + * Returns 'true' if SQE is allowed, 'false' otherwise. 
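[Editor's note] The restriction check added just below reduces to three bit tests: the opcode must be set in the permitted bitmap, every required SQE flag must be present, and nothing outside the union of allowed and required flags may appear. A self-contained sketch of the flag arithmetic; the struct fields mirror the patch, while the policy in main() is purely hypothetical:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct restrictions {
        uint32_t sqe_flags_allowed;
        uint32_t sqe_flags_required;
};

static bool sqe_flags_ok(const struct restrictions *r, uint32_t sqe_flags)
{
        /* every required flag must be present */
        if ((sqe_flags & r->sqe_flags_required) != r->sqe_flags_required)
                return false;
        /* nothing outside allowed|required may be set */
        if (sqe_flags & ~(r->sqe_flags_allowed | r->sqe_flags_required))
                return false;
        return true;
}

int main(void)
{
        /* hypothetical policy: only bit 0 (say, IOSQE_FIXED_FILE) is allowed */
        struct restrictions r = { .sqe_flags_allowed = 1u << 0 };

        printf("%d %d\n", sqe_flags_ok(&r, 1u << 0), sqe_flags_ok(&r, 1u << 2));
        return 0;
}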
+ */ +static inline bool io_check_restriction(struct io_ring_ctx *ctx, + struct io_kiocb *req, + unsigned int sqe_flags) +{ + if (!ctx->restricted) + return true; + + if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) + return false; + + if ((sqe_flags & ctx->restrictions.sqe_flags_required) != + ctx->restrictions.sqe_flags_required) + return false; + + if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | + ctx->restrictions.sqe_flags_required)) + return false; + + return true; +} + #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ IOSQE_BUFFER_SELECT) @@ -6391,11 +6377,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, struct io_submit_state *state) { unsigned int sqe_flags; - int id; + int id, ret; req->opcode = READ_ONCE(sqe->opcode); req->user_data = READ_ONCE(sqe->user_data); - req->io = NULL; + req->async_data = NULL; req->file = NULL; req->ctx = ctx; req->flags = 0; @@ -6415,6 +6401,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) return -EINVAL; + if (unlikely(!io_check_restriction(ctx, req, sqe_flags))) + return -EACCES; + if ((sqe_flags & IOSQE_BUFFER_SELECT) && !io_op_defs[req->opcode].buffer_select) return -EOPNOTSUPP; @@ -6434,11 +6423,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (!io_op_defs[req->opcode].needs_file) return 0; - return io_req_set_file(state, req, READ_ONCE(sqe->fd)); + ret = io_req_set_file(state, req, READ_ONCE(sqe->fd)); + state->ios_left--; + return ret; } -static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, - struct file *ring_file, int ring_fd) +static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) { struct io_submit_state state; struct io_kiocb *link = NULL; @@ -6447,7 +6437,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, /* if we have a backlog and couldn't flush it all, return BUSY */ if (test_bit(0, &ctx->sq_check_overflow)) { if (!list_empty(&ctx->cq_overflow_list) && - !io_cqring_overflow_flush(ctx, false)) + !io_cqring_overflow_flush(ctx, false, NULL, NULL)) return -EBUSY; } @@ -6457,10 +6447,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; - io_submit_state_start(&state, ctx, nr); + atomic_long_add(nr, ¤t->io_uring->req_issue); + refcount_add(nr, ¤t->usage); - ctx->ring_fd = ring_fd; - ctx->ring_file = ring_file; + io_submit_state_start(&state, ctx, nr); for (i = 0; i < nr; i++) { const struct io_uring_sqe *sqe; @@ -6478,12 +6468,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, submitted = -EAGAIN; break; } - - err = io_init_req(ctx, req, sqe, &state); io_consume_sqe(ctx); /* will complete beyond this point, count as submitted */ submitted++; + err = io_init_req(ctx, req, sqe, &state); if (unlikely(err)) { fail_req: io_put_req(req); @@ -6502,6 +6491,8 @@ fail_req: int ref_used = (submitted == -EAGAIN) ? 
0 : submitted; percpu_ref_put_many(&ctx->refs, nr - ref_used); + atomic_long_sub(nr - ref_used, ¤t->io_uring->req_issue); + put_task_struct_many(current, nr - ref_used); } if (link) io_queue_link_head(link, &state.comp); @@ -6528,117 +6519,186 @@ static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->completion_lock); } -static int io_sq_thread(void *data) +static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode, + int sync, void *key) { - struct io_ring_ctx *ctx = data; - const struct cred *old_cred; - DEFINE_WAIT(wait); - unsigned long timeout; + struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry); + int ret; + + ret = autoremove_wake_function(wqe, mode, sync, key); + if (ret) { + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } + return ret; +} + +enum sq_ret { + SQT_IDLE = 1, + SQT_SPIN = 2, + SQT_DID_WORK = 4, +}; + +static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx, + unsigned long start_jiffies, bool cap_entries) +{ + unsigned long timeout = start_jiffies + ctx->sq_thread_idle; + struct io_sq_data *sqd = ctx->sq_data; + unsigned int to_submit; int ret = 0; - complete(&ctx->sq_thread_comp); +again: + if (!list_empty(&ctx->iopoll_list)) { + unsigned nr_events = 0; + + mutex_lock(&ctx->uring_lock); + if (!list_empty(&ctx->iopoll_list) && !need_resched()) + io_do_iopoll(ctx, &nr_events, 0); + mutex_unlock(&ctx->uring_lock); + } - old_cred = override_creds(ctx->creds); + to_submit = io_sqring_entries(ctx); - timeout = jiffies + ctx->sq_thread_idle; - while (!kthread_should_park()) { - unsigned int to_submit; + /* + * If submit got -EBUSY, flag us as needing the application + * to enter the kernel to reap and flush events. + */ + if (!to_submit || ret == -EBUSY || need_resched()) { + /* + * Drop cur_mm before scheduling, we can't hold it for + * long periods (or over schedule()). Do this before + * adding ourselves to the waitqueue, as the unuse/drop + * may sleep. + */ + io_sq_thread_drop_mm(); - if (!list_empty(&ctx->iopoll_list)) { - unsigned nr_events = 0; + /* + * We're polling. If we're within the defined idle + * period, then let us spin without work before going + * to sleep. The exception is if we got EBUSY doing + * more IO, we should wait for the application to + * reap events and wake us up. + */ + if (!list_empty(&ctx->iopoll_list) || need_resched() || + (!time_after(jiffies, timeout) && ret != -EBUSY && + !percpu_ref_is_dying(&ctx->refs))) + return SQT_SPIN; - mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->iopoll_list) && !need_resched()) - io_do_iopoll(ctx, &nr_events, 0); - else - timeout = jiffies + ctx->sq_thread_idle; - mutex_unlock(&ctx->uring_lock); + prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry, + TASK_INTERRUPTIBLE); + + /* + * While doing polled IO, before going to sleep, we need + * to check if there are new reqs added to iopoll_list, + * it is because reqs may have been punted to io worker + * and will be added to iopoll_list later, hence check + * the iopoll_list again. 
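[Editor's note] The ordering in the sleep path above is the classic lost-wakeup defence: the SQ thread adds itself to the waitqueue with prepare_to_wait() first, then re-checks the condition (here, the iopoll list and the SQ ring), and only then sleeps. A hedged pthread analogue of the same discipline, where re-checking the predicate under the mutex plays the role of the post-prepare_to_wait() re-check; the predicate is a made-up stand-in:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool work_available;     /* stands in for "iopoll_list is non-empty" */

static void post_work(void)
{
        pthread_mutex_lock(&lock);
        work_available = true;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
}

static void wait_for_work(void)
{
        pthread_mutex_lock(&lock);
        /* equivalent of prepare_to_wait() + re-check before schedule():
         * a wakeup between the check and the wait cannot be lost, because
         * the check happens under the same lock the waker takes. */
        while (!work_available)
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        post_work();
        wait_for_work();        /* returns immediately: predicate already true */
        return 0;
}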
+ */ + if ((ctx->flags & IORING_SETUP_IOPOLL) && + !list_empty_careful(&ctx->iopoll_list)) { + finish_wait(&sqd->wait, &ctx->sqo_wait_entry); + goto again; } to_submit = io_sqring_entries(ctx); + if (!to_submit || ret == -EBUSY) + return SQT_IDLE; + } + + finish_wait(&sqd->wait, &ctx->sqo_wait_entry); + io_ring_clear_wakeup_flag(ctx); + + /* if we're handling multiple rings, cap submit size for fairness */ + if (cap_entries && to_submit > 8) + to_submit = 8; + + mutex_lock(&ctx->uring_lock); + if (likely(!percpu_ref_is_dying(&ctx->refs))) + ret = io_submit_sqes(ctx, to_submit); + mutex_unlock(&ctx->uring_lock); + + if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait)) + wake_up(&ctx->sqo_sq_wait); + + return SQT_DID_WORK; +} + +static void io_sqd_init_new(struct io_sq_data *sqd) +{ + struct io_ring_ctx *ctx; + + while (!list_empty(&sqd->ctx_new_list)) { + ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list); + init_wait(&ctx->sqo_wait_entry); + ctx->sqo_wait_entry.func = io_sq_wake_function; + list_move_tail(&ctx->sqd_list, &sqd->ctx_list); + complete(&ctx->sq_thread_comp); + } +} + +static int io_sq_thread(void *data) +{ + struct cgroup_subsys_state *cur_css = NULL; + const struct cred *old_cred = NULL; + struct io_sq_data *sqd = data; + struct io_ring_ctx *ctx; + unsigned long start_jiffies; + + start_jiffies = jiffies; + while (!kthread_should_stop()) { + enum sq_ret ret = 0; + bool cap_entries; /* - * If submit got -EBUSY, flag us as needing the application - * to enter the kernel to reap and flush events. + * Any changes to the sqd lists are synchronized through the + * kthread parking. This synchronizes the thread vs users, + * the users are synchronized on the sqd->ctx_lock. */ - if (!to_submit || ret == -EBUSY || need_resched()) { - /* - * Drop cur_mm before scheduling, we can't hold it for - * long periods (or over schedule()). Do this before - * adding ourselves to the waitqueue, as the unuse/drop - * may sleep. - */ - io_sq_thread_drop_mm(); + if (kthread_should_park()) + kthread_parkme(); - /* - * We're polling. If we're within the defined idle - * period, then let us spin without work before going - * to sleep. The exception is if we got EBUSY doing - * more IO, we should wait for the application to - * reap events and wake us up. - */ - if (!list_empty(&ctx->iopoll_list) || need_resched() || - (!time_after(jiffies, timeout) && ret != -EBUSY && - !percpu_ref_is_dying(&ctx->refs))) { - io_run_task_work(); - cond_resched(); - continue; - } + if (unlikely(!list_empty(&sqd->ctx_new_list))) + io_sqd_init_new(sqd); - prepare_to_wait(&ctx->sqo_wait, &wait, - TASK_INTERRUPTIBLE); + cap_entries = !list_is_singular(&sqd->ctx_list); - /* - * While doing polled IO, before going to sleep, we need - * to check if there are new reqs added to iopoll_list, - * it is because reqs may have been punted to io worker - * and will be added to iopoll_list later, hence check - * the iopoll_list again. 
- */ - if ((ctx->flags & IORING_SETUP_IOPOLL) && - !list_empty_careful(&ctx->iopoll_list)) { - finish_wait(&ctx->sqo_wait, &wait); - continue; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + if (current->cred != ctx->creds) { + if (old_cred) + revert_creds(old_cred); + old_cred = override_creds(ctx->creds); } + io_sq_thread_associate_blkcg(ctx, &cur_css); - io_ring_set_wakeup_flag(ctx); + ret |= __io_sq_thread(ctx, start_jiffies, cap_entries); - to_submit = io_sqring_entries(ctx); - if (!to_submit || ret == -EBUSY) { - if (kthread_should_park()) { - finish_wait(&ctx->sqo_wait, &wait); - break; - } - if (io_run_task_work()) { - finish_wait(&ctx->sqo_wait, &wait); - io_ring_clear_wakeup_flag(ctx); - continue; - } - if (signal_pending(current)) - flush_signals(current); - schedule(); - finish_wait(&ctx->sqo_wait, &wait); + io_sq_thread_drop_mm(); + } - io_ring_clear_wakeup_flag(ctx); - ret = 0; + if (ret & SQT_SPIN) { + io_run_task_work(); + cond_resched(); + } else if (ret == SQT_IDLE) { + if (kthread_should_park()) continue; - } - finish_wait(&ctx->sqo_wait, &wait); - - io_ring_clear_wakeup_flag(ctx); + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + io_ring_set_wakeup_flag(ctx); + schedule(); + start_jiffies = jiffies; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + io_ring_clear_wakeup_flag(ctx); } - - mutex_lock(&ctx->uring_lock); - if (likely(!percpu_ref_is_dying(&ctx->refs))) - ret = io_submit_sqes(ctx, to_submit, NULL, -1); - mutex_unlock(&ctx->uring_lock); - timeout = jiffies + ctx->sq_thread_idle; } io_run_task_work(); - io_sq_thread_drop_mm(); - revert_creds(old_cred); + if (cur_css) + io_sq_thread_unassociate_blkcg(); + if (old_cred) + revert_creds(old_cred); kthread_parkme(); @@ -6678,6 +6738,22 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, return autoremove_wake_function(curr, mode, wake_flags, key); } +static int io_run_task_work_sig(void) +{ + if (io_run_task_work()) + return 1; + if (!signal_pending(current)) + return 0; + if (current->jobctl & JOBCTL_TASK_WORK) { + spin_lock_irq(¤t->sighand->siglock); + current->jobctl &= ~JOBCTL_TASK_WORK; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + return 1; + } + return -EINTR; +} + /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. @@ -6723,19 +6799,11 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); /* make sure we run task_work before checking for signals */ - if (io_run_task_work()) + ret = io_run_task_work_sig(); + if (ret > 0) continue; - if (signal_pending(current)) { - if (current->jobctl & JOBCTL_TASK_WORK) { - spin_lock_irq(¤t->sighand->siglock); - current->jobctl &= ~JOBCTL_TASK_WORK; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - continue; - } - ret = -EINTR; + else if (ret < 0) break; - } if (io_should_wake(&iowq, false)) break; schedule(); @@ -6813,18 +6881,116 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) return 0; } -static void io_sq_thread_stop(struct io_ring_ctx *ctx) +static void io_put_sq_data(struct io_sq_data *sqd) { - if (ctx->sqo_thread) { - wait_for_completion(&ctx->sq_thread_comp); + if (refcount_dec_and_test(&sqd->refs)) { /* * The park is a bit of a work-around, without it we get * warning spews on shutdown with SQPOLL set and affinity * set to a single CPU. 
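[Editor's note] io_put_sq_data() tears the shared SQ thread down only when the last attached ring drops its reference. A hedged userspace analogue of stop-on-last-put; the names and the stop flag are hypothetical, and a plain pthread_join() stands in for kthread_park()/kthread_stop(), which have no direct pthread equivalent:

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct sq_data {
        atomic_int refs;
        atomic_bool stop;
        pthread_t thread;
};

static void *poller(void *arg)
{
        struct sq_data *sqd = arg;

        while (!atomic_load(&sqd->stop))
                ;       /* poll submission queues here */
        return NULL;
}

static void sq_data_put(struct sq_data *sqd)
{
        if (atomic_fetch_sub(&sqd->refs, 1) == 1) {
                atomic_store(&sqd->stop, true);         /* ask the poller to exit */
                pthread_join(sqd->thread, NULL);        /* reap it, like kthread_stop() */
                free(sqd);
        }
}

int main(void)
{
        struct sq_data *sqd = calloc(1, sizeof(*sqd));

        atomic_store(&sqd->refs, 2);    /* two rings attached */
        pthread_create(&sqd->thread, NULL, poller, sqd);
        sq_data_put(sqd);               /* first ring detaches: thread keeps running */
        sq_data_put(sqd);               /* last reference: thread stopped and freed */
        return 0;
}

In the kernel version the park-before-stop sequence additionally quiesces the thread before kthread_stop(), which is what the comment above calls a work-around for shutdown warnings.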
*/ - kthread_park(ctx->sqo_thread); - kthread_stop(ctx->sqo_thread); - ctx->sqo_thread = NULL; + if (sqd->thread) { + kthread_park(sqd->thread); + kthread_stop(sqd->thread); + } + + kfree(sqd); + } +} + +static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) +{ + struct io_ring_ctx *ctx_attach; + struct io_sq_data *sqd; + struct fd f; + + f = fdget(p->wq_fd); + if (!f.file) + return ERR_PTR(-ENXIO); + if (f.file->f_op != &io_uring_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + ctx_attach = f.file->private_data; + sqd = ctx_attach->sq_data; + if (!sqd) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + refcount_inc(&sqd->refs); + fdput(f); + return sqd; +} + +static struct io_sq_data *io_get_sq_data(struct io_uring_params *p) +{ + struct io_sq_data *sqd; + + if (p->flags & IORING_SETUP_ATTACH_WQ) + return io_attach_sq_data(p); + + sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); + if (!sqd) + return ERR_PTR(-ENOMEM); + + refcount_set(&sqd->refs, 1); + INIT_LIST_HEAD(&sqd->ctx_list); + INIT_LIST_HEAD(&sqd->ctx_new_list); + mutex_init(&sqd->ctx_lock); + mutex_init(&sqd->lock); + init_waitqueue_head(&sqd->wait); + return sqd; +} + +static void io_sq_thread_unpark(struct io_sq_data *sqd) + __releases(&sqd->lock) +{ + if (!sqd->thread) + return; + kthread_unpark(sqd->thread); + mutex_unlock(&sqd->lock); +} + +static void io_sq_thread_park(struct io_sq_data *sqd) + __acquires(&sqd->lock) +{ + if (!sqd->thread) + return; + mutex_lock(&sqd->lock); + kthread_park(sqd->thread); +} + +static void io_sq_thread_stop(struct io_ring_ctx *ctx) +{ + struct io_sq_data *sqd = ctx->sq_data; + + if (sqd) { + if (sqd->thread) { + /* + * We may arrive here from the error branch in + * io_sq_offload_create() where the kthread is created + * without being waked up, thus wake it up now to make + * sure the wait will complete. 
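[Editor's note] io_attach_sq_data() resolves p->wq_fd to an existing ring and takes a reference on its sq_data, which is what IORING_SETUP_ATTACH_WQ means from userspace: two rings sharing one SQPOLL thread. A hedged raw-syscall usage sketch; it assumes uapi headers new enough to define IORING_SETUP_ATTACH_WQ and the wq_fd field, SQPOLL still requires CAP_SYS_ADMIN at this point in the series, and error handling is elided:

#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct io_uring_params p1, p2;
        int fd1, fd2;

        memset(&p1, 0, sizeof(p1));
        p1.flags = IORING_SETUP_SQPOLL;         /* this ring creates the SQ thread */
        fd1 = (int)syscall(__NR_io_uring_setup, 8, &p1);

        memset(&p2, 0, sizeof(p2));
        p2.flags = IORING_SETUP_SQPOLL | IORING_SETUP_ATTACH_WQ;
        p2.wq_fd = (unsigned int)fd1;           /* share fd1's SQ thread */
        fd2 = (int)syscall(__NR_io_uring_setup, 8, &p2);

        return fd1 < 0 || fd2 < 0;
}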
+ */ + wake_up_process(sqd->thread); + wait_for_completion(&ctx->sq_thread_comp); + + io_sq_thread_park(sqd); + } + + mutex_lock(&sqd->ctx_lock); + list_del(&ctx->sqd_list); + mutex_unlock(&sqd->ctx_lock); + + if (sqd->thread) { + finish_wait(&sqd->wait, &ctx->sqo_wait_entry); + io_sq_thread_unpark(sqd); + } + + io_put_sq_data(sqd); + ctx->sq_data = NULL; } } @@ -6935,13 +7101,13 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx) } #endif -static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, - unsigned nr_files) +static int io_sqe_alloc_file_tables(struct fixed_file_data *file_data, + unsigned nr_tables, unsigned nr_files) { int i; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_data->table[i]; + struct fixed_file_table *table = &file_data->table[i]; unsigned this_files; this_files = min(nr_files, IORING_MAX_FILES_TABLE); @@ -6956,7 +7122,7 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, return 0; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_data->table[i]; + struct fixed_file_table *table = &file_data->table[i]; kfree(table->files); } return 1; @@ -7118,11 +7284,11 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { __s32 __user *fds = (__s32 __user *) arg; - unsigned nr_tables; + unsigned nr_tables, i; struct file *file; - int fd, ret = 0; - unsigned i; + int fd, ret = -ENOMEM; struct fixed_file_ref_node *ref_node; + struct fixed_file_data *file_data; if (ctx->file_data) return -EBUSY; @@ -7131,60 +7297,43 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; - ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL); - if (!ctx->file_data) + file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL); + if (!file_data) return -ENOMEM; - ctx->file_data->ctx = ctx; - init_completion(&ctx->file_data->done); - INIT_LIST_HEAD(&ctx->file_data->ref_list); - spin_lock_init(&ctx->file_data->lock); + file_data->ctx = ctx; + init_completion(&file_data->done); + INIT_LIST_HEAD(&file_data->ref_list); + spin_lock_init(&file_data->lock); nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); - ctx->file_data->table = kcalloc(nr_tables, - sizeof(struct fixed_file_table), - GFP_KERNEL); - if (!ctx->file_data->table) { - kfree(ctx->file_data); - ctx->file_data = NULL; - return -ENOMEM; - } + file_data->table = kcalloc(nr_tables, sizeof(file_data->table), + GFP_KERNEL); + if (!file_data->table) + goto out_free; - if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { - kfree(ctx->file_data->table); - kfree(ctx->file_data); - ctx->file_data = NULL; - return -ENOMEM; - } + if (percpu_ref_init(&file_data->refs, io_file_ref_kill, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) + goto out_free; - if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { - percpu_ref_exit(&ctx->file_data->refs); - kfree(ctx->file_data->table); - kfree(ctx->file_data); - ctx->file_data = NULL; - return -ENOMEM; - } + if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args)) + goto out_ref; for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { struct fixed_file_table *table; unsigned index; - ret = -EFAULT; - if (copy_from_user(&fd, &fds[i], sizeof(fd))) - break; + if (copy_from_user(&fd, &fds[i], sizeof(fd))) { + ret = -EFAULT; + goto out_fput; + } /* allow sparse sets */ - if (fd == -1) { - ret = 0; + if (fd == -1) 
continue; - } - table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; - index = i & IORING_FILE_TABLE_MASK; file = fget(fd); - ret = -EBADF; if (!file) - break; + goto out_fput; /* * Don't allow io_uring instances to be registered. If UNIX @@ -7195,29 +7344,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, */ if (file->f_op == &io_uring_fops) { fput(file); - break; + goto out_fput; } - ret = 0; + table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT]; + index = i & IORING_FILE_TABLE_MASK; table->files[index] = file; } - if (ret) { - for (i = 0; i < ctx->nr_user_files; i++) { - file = io_file_from_index(ctx, i); - if (file) - fput(file); - } - for (i = 0; i < nr_tables; i++) - kfree(ctx->file_data->table[i].files); - - percpu_ref_exit(&ctx->file_data->refs); - kfree(ctx->file_data->table); - kfree(ctx->file_data); - ctx->file_data = NULL; - ctx->nr_user_files = 0; - return ret; - } - + ctx->file_data = file_data; ret = io_sqe_files_scm(ctx); if (ret) { io_sqe_files_unregister(ctx); @@ -7230,11 +7364,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return PTR_ERR(ref_node); } - ctx->file_data->cur_refs = &ref_node->refs; - spin_lock(&ctx->file_data->lock); - list_add(&ref_node->node, &ctx->file_data->ref_list); - spin_unlock(&ctx->file_data->lock); - percpu_ref_get(&ctx->file_data->refs); + file_data->node = ref_node; + spin_lock(&file_data->lock); + list_add(&ref_node->node, &file_data->ref_list); + spin_unlock(&file_data->lock); + percpu_ref_get(&file_data->refs); + return ret; +out_fput: + for (i = 0; i < ctx->nr_user_files; i++) { + file = io_file_from_index(ctx, i); + if (file) + fput(file); + } + for (i = 0; i < nr_tables; i++) + kfree(file_data->table[i].files); + ctx->nr_user_files = 0; +out_ref: + percpu_ref_exit(&file_data->refs); +out_free: + kfree(file_data->table); + kfree(file_data); return ret; } @@ -7285,14 +7434,12 @@ static int io_queue_file_removal(struct fixed_file_data *data, struct file *file) { struct io_file_put *pfile; - struct percpu_ref *refs = data->cur_refs; - struct fixed_file_ref_node *ref_node; + struct fixed_file_ref_node *ref_node = data->node; pfile = kzalloc(sizeof(*pfile), GFP_KERNEL); if (!pfile) return -ENOMEM; - ref_node = container_of(refs, struct fixed_file_ref_node, refs); pfile->file = file; list_add(&pfile->list, &ref_node->file_list); @@ -7375,10 +7522,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, } if (needs_switch) { - percpu_ref_kill(data->cur_refs); + percpu_ref_kill(&data->node->refs); spin_lock(&data->lock); list_add(&ref_node->node, &data->ref_list); - data->cur_refs = &ref_node->refs; + data->node = ref_node; spin_unlock(&data->lock); percpu_ref_get(&ctx->file_data->refs); } else @@ -7459,20 +7606,65 @@ out_fput: return ret; } -static int io_sq_offload_start(struct io_ring_ctx *ctx, - struct io_uring_params *p) +static int io_uring_alloc_task_context(struct task_struct *task) +{ + struct io_uring_task *tctx; + + tctx = kmalloc(sizeof(*tctx), GFP_KERNEL); + if (unlikely(!tctx)) + return -ENOMEM; + + xa_init(&tctx->xa); + init_waitqueue_head(&tctx->wait); + tctx->last = NULL; + tctx->in_idle = 0; + atomic_long_set(&tctx->req_issue, 0); + atomic_long_set(&tctx->req_complete, 0); + task->io_uring = tctx; + return 0; +} + +void __io_uring_free(struct task_struct *tsk) +{ + struct io_uring_task *tctx = tsk->io_uring; + + WARN_ON_ONCE(!xa_empty(&tctx->xa)); + kfree(tctx); + tsk->io_uring = NULL; +} + +static int io_sq_offload_create(struct io_ring_ctx 
*ctx, + struct io_uring_params *p) { int ret; if (ctx->flags & IORING_SETUP_SQPOLL) { + struct io_sq_data *sqd; + ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto err; + sqd = io_get_sq_data(p); + if (IS_ERR(sqd)) { + ret = PTR_ERR(sqd); + goto err; + } + + ctx->sq_data = sqd; + io_sq_thread_park(sqd); + mutex_lock(&sqd->ctx_lock); + list_add(&ctx->sqd_list, &sqd->ctx_new_list); + mutex_unlock(&sqd->ctx_lock); + io_sq_thread_unpark(sqd); + ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); if (!ctx->sq_thread_idle) ctx->sq_thread_idle = HZ; + if (sqd->thread) + goto done; + if (p->flags & IORING_SETUP_SQ_AFF) { int cpu = p->sq_thread_cpu; @@ -7482,25 +7674,27 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, if (!cpu_online(cpu)) goto err; - ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread, - ctx, cpu, - "io_uring-sq"); + sqd->thread = kthread_create_on_cpu(io_sq_thread, sqd, + cpu, "io_uring-sq"); } else { - ctx->sqo_thread = kthread_create(io_sq_thread, ctx, + sqd->thread = kthread_create(io_sq_thread, sqd, "io_uring-sq"); } - if (IS_ERR(ctx->sqo_thread)) { - ret = PTR_ERR(ctx->sqo_thread); - ctx->sqo_thread = NULL; + if (IS_ERR(sqd->thread)) { + ret = PTR_ERR(sqd->thread); + sqd->thread = NULL; goto err; } - wake_up_process(ctx->sqo_thread); + ret = io_uring_alloc_task_context(sqd->thread); + if (ret) + goto err; } else if (p->flags & IORING_SETUP_SQ_AFF) { /* Can't have SQ_AFF without SQPOLL */ ret = -EINVAL; goto err; } +done: ret = io_init_wq_offload(ctx, p); if (ret) goto err; @@ -7511,6 +7705,14 @@ err: return ret; } +static void io_sq_offload_start(struct io_ring_ctx *ctx) +{ + struct io_sq_data *sqd = ctx->sq_data; + + if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread) + wake_up_process(sqd->thread); +} + static inline void __io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) { @@ -7542,11 +7744,11 @@ static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, if (ctx->limit_mem) __io_unaccount_mem(ctx->user, nr_pages); - if (ctx->sqo_mm) { + if (ctx->mm_account) { if (acct == ACCT_LOCKED) - ctx->sqo_mm->locked_vm -= nr_pages; + ctx->mm_account->locked_vm -= nr_pages; else if (acct == ACCT_PINNED) - atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm); + atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); } } @@ -7561,11 +7763,11 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, return ret; } - if (ctx->sqo_mm) { + if (ctx->mm_account) { if (acct == ACCT_LOCKED) - ctx->sqo_mm->locked_vm += nr_pages; + ctx->mm_account->locked_vm += nr_pages; else if (acct == ACCT_PINNED) - atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm); + atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); } return 0; @@ -7645,7 +7847,8 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) for (j = 0; j < imu->nr_bvecs; j++) unpin_user_page(imu->bvec[j].bv_page); - io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED); + if (imu->acct_pages) + io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED); kvfree(imu->bvec); imu->nr_bvecs = 0; } @@ -7681,11 +7884,80 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, return 0; } +/* + * Not super efficient, but this is just a registration time. And we do cache + * the last compound head, so generally we'll only do a full search if we don't + * match that one. + * + * We check if the given compound head page has already been accounted, to + * avoid double accounting it. 
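[Editor's note] The accounting helpers that follow implement exactly what this comment describes: treat a huge page as one unit, cache the last compound head seen, and fall back to a full scan of the current and previously registered pages only on a miss. A self-contained sketch of the dedup-by-head idea; the page model is hypothetical, with PageCompound()/compound_head() only mimicked by a flag and a pointer:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical page model: compound subpages share a 'head' pointer. */
struct page {
        bool compound;
        struct page *head;
};

static bool head_already_seen(struct page **pages, int n, struct page *hpage)
{
        for (int i = 0; i < n; i++)
                if (pages[i]->compound && pages[i]->head == hpage)
                        return true;
        return false;
}

/* Count pages to account, charging each distinct huge page once at its
 * full size; the 'last_hpage' cache mirrors the fast path in the patch. */
static int pages_to_account(struct page **pages, int nr, int pages_per_huge)
{
        struct page *last_hpage = NULL;
        int acct = 0;

        for (int i = 0; i < nr; i++) {
                if (!pages[i]->compound) {
                        acct++;
                        continue;
                }
                if (pages[i]->head == last_hpage)
                        continue;
                last_hpage = pages[i]->head;
                if (head_already_seen(pages, i, last_hpage))
                        continue;
                acct += pages_per_huge;
        }
        return acct;
}

int main(void)
{
        struct page huge = { .compound = true, .head = &huge };
        struct page a = { .compound = true, .head = &huge };
        struct page b = { .compound = true, .head = &huge };
        struct page normal = { .compound = false };
        struct page *pages[] = { &a, &b, &normal };

        /* 512 once for the shared huge page + 1 normal page = 513 */
        printf("%d\n", pages_to_account(pages, 3, 512));
        return 0;
}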
This allows us to account the full size of the + * page, not just the constituent pages of a huge page. + */ +static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, + int nr_pages, struct page *hpage) +{ + int i, j; + + /* check current page array */ + for (i = 0; i < nr_pages; i++) { + if (!PageCompound(pages[i])) + continue; + if (compound_head(pages[i]) == hpage) + return true; + } + + /* check previously registered pages */ + for (i = 0; i < ctx->nr_user_bufs; i++) { + struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; + + for (j = 0; j < imu->nr_bvecs; j++) { + if (!PageCompound(imu->bvec[j].bv_page)) + continue; + if (compound_head(imu->bvec[j].bv_page) == hpage) + return true; + } + } + + return false; +} + +static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, + int nr_pages, struct io_mapped_ubuf *imu, + struct page **last_hpage) +{ + int i, ret; + + for (i = 0; i < nr_pages; i++) { + if (!PageCompound(pages[i])) { + imu->acct_pages++; + } else { + struct page *hpage; + + hpage = compound_head(pages[i]); + if (hpage == *last_hpage) + continue; + *last_hpage = hpage; + if (headpage_already_acct(ctx, pages, i, hpage)) + continue; + imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; + } + } + + if (!imu->acct_pages) + return 0; + + ret = io_account_mem(ctx, imu->acct_pages, ACCT_PINNED); + if (ret) + imu->acct_pages = 0; + return ret; +} + static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { struct vm_area_struct **vmas = NULL; struct page **pages = NULL; + struct page *last_hpage = NULL; int i, j, got_pages = 0; int ret = -EINVAL; @@ -7728,10 +8000,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, start = ubuf >> PAGE_SHIFT; nr_pages = end - start; - ret = io_account_mem(ctx, nr_pages, ACCT_PINNED); - if (ret) - goto err; - ret = 0; if (!pages || nr_pages > got_pages) { kvfree(vmas); @@ -7743,7 +8011,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); if (!pages || !vmas) { ret = -ENOMEM; - io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); goto err; } got_pages = nr_pages; @@ -7752,10 +8019,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec), GFP_KERNEL); ret = -ENOMEM; - if (!imu->bvec) { - io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); + if (!imu->bvec) goto err; - } ret = 0; mmap_read_lock(current->mm); @@ -7784,7 +8049,13 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, */ if (pret > 0) unpin_user_pages(pages, pret); - io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); + kvfree(imu->bvec); + goto err; + } + + ret = io_buffer_account_pin(ctx, pages, pret, imu, &last_hpage); + if (ret) { + unpin_user_pages(pages, pret); kvfree(imu->bvec); goto err; } @@ -7869,11 +8140,19 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_finish_async(ctx); io_sqe_buffer_unregister(ctx); - if (ctx->sqo_mm) { - mmdrop(ctx->sqo_mm); - ctx->sqo_mm = NULL; + + if (ctx->sqo_task) { + put_task_struct(ctx->sqo_task); + ctx->sqo_task = NULL; + mmdrop(ctx->mm_account); + ctx->mm_account = NULL; } +#ifdef CONFIG_BLK_CGROUP + if (ctx->sqo_blkcg_css) + css_put(ctx->sqo_blkcg_css); +#endif + io_sqe_files_unregister(ctx); io_eventfd_unregister(ctx); io_destroy_buffers(ctx); @@ -7908,8 +8187,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * io_commit_cqring */ smp_rmb(); - if 
(READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head != - ctx->rings->sq_ring_entries) + if (!io_sqring_full(ctx)) mask |= EPOLLOUT | EPOLLWRNORM; if (io_cqring_events(ctx, false)) mask |= EPOLLIN | EPOLLRDNORM; @@ -7948,7 +8226,7 @@ static void io_ring_exit_work(struct work_struct *work) */ do { if (ctx->rings) - io_cqring_overflow_flush(ctx, true); + io_cqring_overflow_flush(ctx, true, NULL, NULL); io_iopoll_try_reap_events(ctx); } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); io_ring_ctx_free(ctx); @@ -7960,15 +8238,15 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); - io_kill_timeouts(ctx); - io_poll_remove_all(ctx); + io_kill_timeouts(ctx, NULL); + io_poll_remove_all(ctx, NULL); if (ctx->io_wq) io_wq_cancel_all(ctx->io_wq); /* if we failed setting up the ctx, we might not have any rings */ if (ctx->rings) - io_cqring_overflow_flush(ctx, true); + io_cqring_overflow_flush(ctx, true, NULL, NULL); io_iopoll_try_reap_events(ctx); idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); @@ -8003,7 +8281,7 @@ static bool io_wq_files_match(struct io_wq_work *work, void *data) { struct files_struct *files = data; - return work->files == files; + return !files || work->files == files; } /* @@ -8024,12 +8302,6 @@ static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) return false; } -static inline bool io_match_files(struct io_kiocb *req, - struct files_struct *files) -{ - return (req->flags & REQ_F_WORK_INITIALIZED) && req->work.files == files; -} - static bool io_match_link_files(struct io_kiocb *req, struct files_struct *files) { @@ -8145,11 +8417,14 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, } } -static void io_uring_cancel_files(struct io_ring_ctx *ctx, +/* + * Returns true if we found and killed one or more files pinning requests + */ +static bool io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { if (list_empty_careful(&ctx->inflight_list)) - return; + return false; io_cancel_defer_files(ctx, files); /* cancel all at once, should be faster than doing it one by one*/ @@ -8161,7 +8436,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { - if (req->work.files != files) + if (files && req->work.files != files) continue; /* req is being completed, ignore */ if (!refcount_inc_not_zero(&req->refs)) @@ -8180,9 +8455,13 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, /* cancel this request, or head link requests */ io_attempt_cancel(ctx, cancel_req); io_put_req(cancel_req); + /* cancellations _may_ trigger task work */ + io_run_task_work(); schedule(); finish_wait(&ctx->inflight_wait, &wait); } + + return true; } static bool io_cancel_task_cb(struct io_wq_work *work, void *data) @@ -8190,21 +8469,198 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct task_struct *task = data; - return req->task == task; + return io_task_match(req, task); +} + +static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, + struct task_struct *task, + struct files_struct *files) +{ + bool ret; + + ret = io_uring_cancel_files(ctx, files); + if (!files) { + enum io_wq_cancel cret; + + cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true); + if (cret != IO_WQ_CANCEL_NOTFOUND) + ret = true; + + /* SQPOLL thread does its own 
polling */ + if (!(ctx->flags & IORING_SETUP_SQPOLL)) { + while (!list_empty_careful(&ctx->iopoll_list)) { + io_iopoll_try_reap_events(ctx); + ret = true; + } + } + + ret |= io_poll_remove_all(ctx, task); + ret |= io_kill_timeouts(ctx, task); + } + + return ret; +} + +/* + * We need to iteratively cancel requests, in case a request has dependent + * hard links. These persist even for failure of cancelations, hence keep + * looping until none are found. + */ +static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, + struct files_struct *files) +{ + struct task_struct *task = current; + + if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) + task = ctx->sq_data->thread; + + io_cqring_overflow_flush(ctx, true, task, files); + + while (__io_uring_cancel_task_requests(ctx, task, files)) { + io_run_task_work(); + cond_resched(); + } +} + +/* + * Note that this task has used io_uring. We use it for cancelation purposes. + */ +static int io_uring_add_task_file(struct file *file) +{ + struct io_uring_task *tctx = current->io_uring; + + if (unlikely(!tctx)) { + int ret; + + ret = io_uring_alloc_task_context(current); + if (unlikely(ret)) + return ret; + tctx = current->io_uring; + } + if (tctx->last != file) { + void *old = xa_load(&tctx->xa, (unsigned long)file); + + if (!old) { + get_file(file); + xa_store(&tctx->xa, (unsigned long)file, file, GFP_KERNEL); + } + tctx->last = file; + } + + return 0; +} + +/* + * Remove this io_uring_file -> task mapping. + */ +static void io_uring_del_task_file(struct file *file) +{ + struct io_uring_task *tctx = current->io_uring; + + if (tctx->last == file) + tctx->last = NULL; + file = xa_erase(&tctx->xa, (unsigned long)file); + if (file) + fput(file); +} + +static void __io_uring_attempt_task_drop(struct file *file) +{ + struct file *old = xa_load(¤t->io_uring->xa, (unsigned long)file); + + if (old == file) + io_uring_del_task_file(file); +} + +/* + * Drop task note for this file if we're the only ones that hold it after + * pending fput() + */ +static void io_uring_attempt_task_drop(struct file *file, bool exiting) +{ + if (!current->io_uring) + return; + /* + * fput() is pending, will be 2 if the only other ref is our potential + * task file note. If the task is exiting, drop regardless of count. + */ + if (!exiting && atomic_long_read(&file->f_count) != 2) + return; + + __io_uring_attempt_task_drop(file); +} + +void __io_uring_files_cancel(struct files_struct *files) +{ + struct io_uring_task *tctx = current->io_uring; + struct file *file; + unsigned long index; + + /* make sure overflow events are dropped */ + tctx->in_idle = true; + + xa_for_each(&tctx->xa, index, file) { + struct io_ring_ctx *ctx = file->private_data; + + io_uring_cancel_task_requests(ctx, files); + if (files) + io_uring_del_task_file(file); + } +} + +static inline bool io_uring_task_idle(struct io_uring_task *tctx) +{ + return atomic_long_read(&tctx->req_issue) == + atomic_long_read(&tctx->req_complete); +} + +/* + * Find any io_uring fd that this task has registered or done IO on, and cancel + * requests. 
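[Editor's note] io_uring_add_task_file(), added above, keeps a per-task registry of ring files in an xarray and short-circuits through tctx->last, since nearly every submit targets the same ring as the previous one. A hedged userspace sketch of that register-once-with-last-cache pattern; the growable array is our stand-in for the xarray, and all names are invented:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct registry {
        void **items;
        size_t nr, cap;
        void *last;     /* fast path: most submits reuse the previous ring */
};

static int registry_add(struct registry *r, void *item)
{
        if (r->last == item)
                return 0;               /* common case, no lookup at all */
        for (size_t i = 0; i < r->nr; i++)
                if (r->items[i] == item)
                        goto done;
        if (r->nr == r->cap) {
                size_t cap = r->cap ? 2 * r->cap : 4;
                void **tmp = realloc(r->items, cap * sizeof(*tmp));

                if (!tmp)
                        return -1;
                r->items = tmp;
                r->cap = cap;
        }
        r->items[r->nr++] = item;       /* kernel: xa_store() + get_file() */
done:
        r->last = item;
        return 0;
}

int main(void)
{
        struct registry r = { 0 };
        int ring_a, ring_b;

        registry_add(&r, &ring_a);
        registry_add(&r, &ring_a);      /* hits the ->last cache */
        registry_add(&r, &ring_b);
        printf("%zu\n", r.nr);          /* 2 */
        free(r.items);
        return 0;
}

The kernel version additionally takes a file reference on first insert and drops it in io_uring_del_task_file().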
+ */ +void __io_uring_task_cancel(void) +{ + struct io_uring_task *tctx = current->io_uring; + DEFINE_WAIT(wait); + long completions; + + /* make sure overflow events are dropped */ + tctx->in_idle = true; + + while (!io_uring_task_idle(tctx)) { + /* read completions before cancelations */ + completions = atomic_long_read(&tctx->req_complete); + __io_uring_files_cancel(NULL); + + prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); + + /* + * If we've seen completions, retry. This avoids a race where + * a completion comes in before we did prepare_to_wait(). + */ + if (completions != atomic_long_read(&tctx->req_complete)) + continue; + if (io_uring_task_idle(tctx)) + break; + schedule(); + } + + finish_wait(&tctx->wait, &wait); + tctx->in_idle = false; } static int io_uring_flush(struct file *file, void *data) { struct io_ring_ctx *ctx = file->private_data; - io_uring_cancel_files(ctx, data); - /* * If the task is going away, cancel work it may have pending */ if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) - io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true); + data = NULL; + io_uring_cancel_task_requests(ctx, data); + io_uring_attempt_task_drop(file, !data); return 0; } @@ -8278,6 +8734,25 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, #endif /* !CONFIG_MMU */ +static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) +{ + DEFINE_WAIT(wait); + + do { + if (!io_sqring_full(ctx)) + break; + + prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); + + if (!io_sqring_full(ctx)) + break; + + schedule(); + } while (!signal_pending(current)); + + finish_wait(&ctx->sqo_sq_wait, &wait); +} + SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, u32, min_complete, u32, flags, const sigset_t __user *, sig, size_t, sigsz) @@ -8289,7 +8764,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_run_task_work(); - if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) + if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | + IORING_ENTER_SQ_WAIT)) return -EINVAL; f = fdget(fd); @@ -8305,6 +8781,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (!percpu_ref_tryget(&ctx->refs)) goto out_fput; + ret = -EBADFD; + if (ctx->flags & IORING_SETUP_R_DISABLED) + goto out; + /* * For SQ polling, the thread will do all submissions and completions. 
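[Editor's note] io_sqpoll_wait_sq(), added above, lets a submitter block until the SQPOLL thread has freed SQ ring space; it is surfaced to userspace as the new IORING_ENTER_SQ_WAIT flag to io_uring_enter(). A hedged raw-syscall sketch; the flag value matches this series and should only be defined locally if your uapi headers predate it, and wait_for_sq_space() is a hypothetical helper name:

#include <sys/syscall.h>
#include <unistd.h>

#ifndef IORING_ENTER_SQ_WAIT
#define IORING_ENTER_SQ_WAIT    (1U << 2)       /* value from this series */
#endif

/* Block in the kernel until the SQPOLL thread has made SQ ring space;
 * nothing is submitted and no completions are waited for. */
static long wait_for_sq_space(int ring_fd)
{
        return syscall(__NR_io_uring_enter, ring_fd, 0, 0,
                       IORING_ENTER_SQ_WAIT, NULL, 0);
}

int main(void)
{
        /* -1 for demonstration only (fails with EBADF); pass a real
         * SQPOLL ring fd in actual use. */
        return wait_for_sq_space(-1) < 0;
}

A busy submitter would typically pair this with checking IORING_SQ_NEED_WAKEUP and setting IORING_ENTER_SQ_WAKEUP when the SQ thread has gone idle.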
* Just return the requested submit count, and wake the thread if @@ -8313,13 +8793,18 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { if (!list_empty_careful(&ctx->cq_overflow_list)) - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx, false, NULL, NULL); if (flags & IORING_ENTER_SQ_WAKEUP) - wake_up(&ctx->sqo_wait); + wake_up(&ctx->sq_data->wait); + if (flags & IORING_ENTER_SQ_WAIT) + io_sqpoll_wait_sq(ctx); submitted = to_submit; } else if (to_submit) { + ret = io_uring_add_task_file(f.file); + if (unlikely(ret)) + goto out; mutex_lock(&ctx->uring_lock); - submitted = io_submit_sqes(ctx, to_submit, f.file, fd); + submitted = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); if (submitted != to_submit) @@ -8385,11 +8870,25 @@ static int io_uring_show_cred(int id, void *p, void *data) static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) { + struct io_sq_data *sq = NULL; + bool has_lock; int i; - mutex_lock(&ctx->uring_lock); + /* + * Avoid ABBA deadlock between the seq lock and the io_uring mutex, + * since fdinfo case grabs it in the opposite direction of normal use + * cases. If we fail to get the lock, we just don't iterate any + * structures that could be going away outside the io_uring mutex. + */ + has_lock = mutex_trylock(&ctx->uring_lock); + + if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) + sq = ctx->sq_data; + + seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); + seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1); seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); - for (i = 0; i < ctx->nr_user_files; i++) { + for (i = 0; has_lock && i < ctx->nr_user_files; i++) { struct fixed_file_table *table; struct file *f; @@ -8401,13 +8900,13 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, "%5u: <none>\n", i); } seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); - for (i = 0; i < ctx->nr_user_bufs; i++) { + for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { struct io_mapped_ubuf *buf = &ctx->user_bufs[i]; seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, (unsigned int) buf->len); } - if (!idr_is_empty(&ctx->personality_idr)) { + if (has_lock && !idr_is_empty(&ctx->personality_idr)) { seq_printf(m, "Personalities:\n"); idr_for_each(&ctx->personality_idr, io_uring_show_cred, m); } @@ -8422,7 +8921,8 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) req->task->task_works != NULL); } spin_unlock_irq(&ctx->completion_lock); - mutex_unlock(&ctx->uring_lock); + if (has_lock) + mutex_unlock(&ctx->uring_lock); } static void io_uring_show_fdinfo(struct seq_file *m, struct file *f) @@ -8520,6 +9020,7 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx, O_RDWR | O_CLOEXEC); if (IS_ERR(file)) { +err_fd: put_unused_fd(ret); ret = PTR_ERR(file); goto err; @@ -8528,6 +9029,10 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; #endif + if (unlikely(io_uring_add_task_file(file))) { + file = ERR_PTR(-ENOMEM); + goto err_fd; + } fd_install(ret, file); return ret; err: @@ -8605,8 +9110,35 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->user = user; ctx->creds = get_current_cred(); + ctx->sqo_task = get_task_struct(current); + + /* + * This is just grabbed for accounting purposes. 
When a process exits, + * the mm is exited and dropped before the files, hence we need to hang + * on to this mm purely for the purposes of being able to unaccount + * memory (locked/pinned vm). It's not used for anything else. + */ mmgrab(current->mm); - ctx->sqo_mm = current->mm; + ctx->mm_account = current->mm; + +#ifdef CONFIG_BLK_CGROUP + /* + * The sq thread will belong to the original cgroup it was inited in. + * If the cgroup goes offline (e.g. disabling the io controller), then + * issued bios will be associated with the closest cgroup later in the + * block layer. + */ + rcu_read_lock(); + ctx->sqo_blkcg_css = blkcg_css(); + ret = css_tryget_online(ctx->sqo_blkcg_css); + rcu_read_unlock(); + if (!ret) { + /* don't init against a dying cgroup, have the user try again */ + ctx->sqo_blkcg_css = NULL; + ret = -ENODEV; + goto err; + } +#endif /* * Account memory _before_ installing the file descriptor. Once @@ -8622,10 +9154,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, if (ret) goto err; - ret = io_sq_offload_start(ctx, p); + ret = io_sq_offload_create(ctx, p); if (ret) goto err; + if (!(p->flags & IORING_SETUP_R_DISABLED)) + io_sq_offload_start(ctx); + memset(&p->sq_off, 0, sizeof(p->sq_off)); p->sq_off.head = offsetof(struct io_rings, sq.head); p->sq_off.tail = offsetof(struct io_rings, sq.tail); @@ -8688,7 +9223,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | - IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ)) + IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | + IORING_SETUP_R_DISABLED)) return -EINVAL; return io_uring_create(entries, &p, params); @@ -8764,6 +9300,91 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) return -EINVAL; } +static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg, + unsigned int nr_args) +{ + struct io_uring_restriction *res; + size_t size; + int i, ret; + + /* Restrictions allowed only if rings started disabled */ + if (!(ctx->flags & IORING_SETUP_R_DISABLED)) + return -EBADFD; + + /* We allow only a single restrictions registration */ + if (ctx->restrictions.registered) + return -EBUSY; + + if (!arg || nr_args > IORING_MAX_RESTRICTIONS) + return -EINVAL; + + size = array_size(nr_args, sizeof(*res)); + if (size == SIZE_MAX) + return -EOVERFLOW; + + res = memdup_user(arg, size); + if (IS_ERR(res)) + return PTR_ERR(res); + + ret = 0; + + for (i = 0; i < nr_args; i++) { + switch (res[i].opcode) { + case IORING_RESTRICTION_REGISTER_OP: + if (res[i].register_op >= IORING_REGISTER_LAST) { + ret = -EINVAL; + goto out; + } + + __set_bit(res[i].register_op, + ctx->restrictions.register_op); + break; + case IORING_RESTRICTION_SQE_OP: + if (res[i].sqe_op >= IORING_OP_LAST) { + ret = -EINVAL; + goto out; + } + + __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); + break; + case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: + ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; + break; + case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: + ctx->restrictions.sqe_flags_required = res[i].sqe_flags; + break; + default: + ret = -EINVAL; + goto out; + } + } + +out: + /* Reset all restrictions if an error happened */ + if (ret != 0) + memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); + else + ctx->restrictions.registered = true; + + kfree(res); + return ret; +} + +static int io_register_enable_rings(struct io_ring_ctx *ctx) +{ + if (!(ctx->flags & 
IORING_SETUP_R_DISABLED)) + return -EBADFD; + + if (ctx->restrictions.registered) + ctx->restricted = 1; + + ctx->flags &= ~IORING_SETUP_R_DISABLED; + + io_sq_offload_start(ctx); + + return 0; +} + static bool io_register_op_must_quiesce(int op) { switch (op) { @@ -8805,11 +9426,31 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, * after we've killed the percpu ref. */ mutex_unlock(&ctx->uring_lock); - ret = wait_for_completion_interruptible(&ctx->ref_comp); + do { + ret = wait_for_completion_interruptible(&ctx->ref_comp); + if (!ret) + break; + ret = io_run_task_work_sig(); + if (ret < 0) + break; + } while (1); + mutex_lock(&ctx->uring_lock); + if (ret) { percpu_ref_resurrect(&ctx->refs); - ret = -EINTR; + goto out_quiesce; + } + } + + if (ctx->restricted) { + if (opcode >= IORING_REGISTER_LAST) { + ret = -EINVAL; + goto out; + } + + if (!test_bit(opcode, ctx->restrictions.register_op)) { + ret = -EACCES; goto out; } } @@ -8873,15 +9514,25 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_unregister_personality(ctx, nr_args); break; + case IORING_REGISTER_ENABLE_RINGS: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_register_enable_rings(ctx); + break; + case IORING_REGISTER_RESTRICTIONS: + ret = io_register_restrictions(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; } +out: if (io_register_op_must_quiesce(opcode)) { /* bring the ctx back to life */ percpu_ref_reinit(&ctx->refs); -out: +out_quiesce: reinit_completion(&ctx->ref_comp); } return ret; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index bcfc288dba3f..8180061b9e16 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -22,18 +22,25 @@ #include "../internal.h" /* - * Structure allocated for each page when block size < PAGE_SIZE to track - * sub-page uptodate status and I/O completions. + * Structure allocated for each page or THP when block size < page size + * to track sub-page uptodate status and I/O completions. */ struct iomap_page { - atomic_t read_count; - atomic_t write_count; + atomic_t read_bytes_pending; + atomic_t write_bytes_pending; spinlock_t uptodate_lock; - DECLARE_BITMAP(uptodate, PAGE_SIZE / 512); + unsigned long uptodate[]; }; static inline struct iomap_page *to_iomap_page(struct page *page) { + /* + * per-block data is stored in the head page. Callers should + * not be dealing with tail pages (and if they are, they can + * call thp_head() first. + */ + VM_BUG_ON_PGFLAGS(PageTail(page), page); + if (page_has_private(page)) return (struct iomap_page *)page_private(page); return NULL; @@ -45,20 +52,16 @@ static struct iomap_page * iomap_page_create(struct inode *inode, struct page *page) { struct iomap_page *iop = to_iomap_page(page); + unsigned int nr_blocks = i_blocks_per_page(inode, page); - if (iop || i_blocksize(inode) == PAGE_SIZE) + if (iop || nr_blocks <= 1) return iop; - iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL); - atomic_set(&iop->read_count, 0); - atomic_set(&iop->write_count, 0); + iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), + GFP_NOFS | __GFP_NOFAIL); spin_lock_init(&iop->uptodate_lock); - bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE); - - /* - * migrate_page_move_mapping() assumes that pages with private data have - * their count elevated by 1. 
- */ + if (PageUptodate(page)) + bitmap_fill(iop->uptodate, nr_blocks); attach_page_private(page, iop); return iop; } @@ -67,11 +70,14 @@ static void iomap_page_release(struct page *page) { struct iomap_page *iop = detach_page_private(page); + unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page); if (!iop) return; - WARN_ON_ONCE(atomic_read(&iop->read_count)); - WARN_ON_ONCE(atomic_read(&iop->write_count)); + WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending)); + WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending)); + WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != + PageUptodate(page)); kfree(iop); } @@ -142,19 +148,11 @@ iomap_iop_set_range_uptodate(struct page *page, unsigned off, unsigned len) struct inode *inode = page->mapping->host; unsigned first = off >> inode->i_blkbits; unsigned last = (off + len - 1) >> inode->i_blkbits; - bool uptodate = true; unsigned long flags; - unsigned int i; spin_lock_irqsave(&iop->uptodate_lock, flags); - for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) { - if (i >= first && i <= last) - set_bit(i, iop->uptodate); - else if (!test_bit(i, iop->uptodate)) - uptodate = false; - } - - if (uptodate) + bitmap_set(iop->uptodate, first, last - first + 1); + if (bitmap_full(iop->uptodate, i_blocks_per_page(inode, page))) SetPageUptodate(page); spin_unlock_irqrestore(&iop->uptodate_lock, flags); } @@ -172,13 +170,6 @@ iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len) } static void -iomap_read_finish(struct iomap_page *iop, struct page *page) -{ - if (!iop || atomic_dec_and_test(&iop->read_count)) - unlock_page(page); -} - -static void iomap_read_page_end_io(struct bio_vec *bvec, int error) { struct page *page = bvec->bv_page; @@ -191,7 +182,8 @@ iomap_read_page_end_io(struct bio_vec *bvec, int error) iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len); } - iomap_read_finish(iop, page); + if (!iop || atomic_sub_and_test(bvec->bv_len, &iop->read_bytes_pending)) + unlock_page(page); } static void @@ -271,30 +263,19 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, } ctx->cur_page_in_bio = true; + if (iop) + atomic_add(plen, &iop->read_bytes_pending); - /* - * Try to merge into a previous segment if we can. - */ + /* Try to merge into a previous segment if we can */ sector = iomap_sector(iomap, pos); - if (ctx->bio && bio_end_sector(ctx->bio) == sector) + if (ctx->bio && bio_end_sector(ctx->bio) == sector) { + if (__bio_try_merge_page(ctx->bio, page, plen, poff, + &same_page)) + goto done; is_contig = true; - - if (is_contig && - __bio_try_merge_page(ctx->bio, page, plen, poff, &same_page)) { - if (!same_page && iop) - atomic_inc(&iop->read_count); - goto done; } - /* - * If we start a new segment we need to increase the read count, and we - * need to do so before submitting any previous full bio to make sure - * that we don't prematurely unlock the page. 
- */ - if (iop) - atomic_inc(&iop->read_count); - - if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) { + if (!is_contig || bio_full(ctx->bio, plen)) { gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); gfp_t orig_gfp = gfp; int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -571,13 +552,13 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, { struct iomap_page *iop = iomap_page_create(inode, page); loff_t block_size = i_blocksize(inode); - loff_t block_start = pos & ~(block_size - 1); - loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1); + loff_t block_start = round_down(pos, block_size); + loff_t block_end = round_up(pos + len, block_size); unsigned from = offset_in_page(pos), to = from + len, poff, plen; - int status; if (PageUptodate(page)) return 0; + ClearPageError(page); do { iomap_adjust_read_range(inode, iop, &block_start, @@ -594,14 +575,13 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE)) return -EIO; zero_user_segments(page, poff, from, to, poff + plen); - iomap_set_range_uptodate(page, poff, plen); - continue; + } else { + int status = iomap_read_page_sync(block_start, page, + poff, plen, srcmap); + if (status) + return status; } - - status = iomap_read_page_sync(block_start, page, poff, plen, - srcmap); - if (status) - return status; + iomap_set_range_uptodate(page, poff, plen); } while ((block_start += plen) < block_end); return 0; @@ -685,9 +665,8 @@ iomap_set_page_dirty(struct page *page) } EXPORT_SYMBOL_GPL(iomap_set_page_dirty); -static int -__iomap_write_end(struct inode *inode, loff_t pos, unsigned len, - unsigned copied, struct page *page) +static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, + size_t copied, struct page *page) { flush_dcache_page(page); @@ -709,15 +688,15 @@ __iomap_write_end(struct inode *inode, loff_t pos, unsigned len, return copied; } -static int -iomap_write_end_inline(struct inode *inode, struct page *page, - struct iomap *iomap, loff_t pos, unsigned copied) +static size_t iomap_write_end_inline(struct inode *inode, struct page *page, + struct iomap *iomap, loff_t pos, size_t copied) { void *addr; WARN_ON_ONCE(!PageUptodate(page)); BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data)); + flush_dcache_page(page); addr = kmap_atomic(page); memcpy(iomap->inline_data + pos, addr + pos, copied); kunmap_atomic(addr); @@ -726,13 +705,14 @@ iomap_write_end_inline(struct inode *inode, struct page *page, return copied; } -static int -iomap_write_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, - struct page *page, struct iomap *iomap, struct iomap *srcmap) +/* Returns the number of bytes copied. May be 0. Cannot be an errno. 
*/ +static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len, + size_t copied, struct page *page, struct iomap *iomap, + struct iomap *srcmap) { const struct iomap_page_ops *page_ops = iomap->page_ops; loff_t old_size = inode->i_size; - int ret; + size_t ret; if (srcmap->type == IOMAP_INLINE) { ret = iomap_write_end_inline(inode, page, iomap, pos, copied); @@ -811,13 +791,8 @@ again: copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); - flush_dcache_page(page); - - status = iomap_write_end(inode, pos, bytes, copied, page, iomap, + copied = iomap_write_end(inode, pos, bytes, copied, page, iomap, srcmap); - if (unlikely(status < 0)) - break; - copied = status; cond_resched(); @@ -891,11 +866,8 @@ iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data, status = iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap); - if (unlikely(status <= 0)) { - if (WARN_ON_ONCE(status == 0)) - return -EIO; - return status; - } + if (WARN_ON_ONCE(status == 0)) + return -EIO; cond_resched(); @@ -928,11 +900,13 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, } EXPORT_SYMBOL_GPL(iomap_file_unshare); -static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, - unsigned bytes, struct iomap *iomap, struct iomap *srcmap) +static s64 iomap_zero(struct inode *inode, loff_t pos, u64 length, + struct iomap *iomap, struct iomap *srcmap) { struct page *page; int status; + unsigned offset = offset_in_page(pos); + unsigned bytes = min_t(u64, PAGE_SIZE - offset, length); status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap); if (status) @@ -944,38 +918,33 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap); } -static loff_t -iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos, + loff_t length, void *data, struct iomap *iomap, + struct iomap *srcmap) { bool *did_zero = data; loff_t written = 0; - int status; /* already zeroed? we're done. 
*/ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) - return count; + return length; do { - unsigned offset, bytes; - - offset = offset_in_page(pos); - bytes = min_t(loff_t, PAGE_SIZE - offset, count); + s64 bytes; if (IS_DAX(inode)) - status = dax_iomap_zero(pos, offset, bytes, iomap); + bytes = dax_iomap_zero(pos, length, iomap); else - status = iomap_zero(inode, pos, offset, bytes, iomap, - srcmap); - if (status < 0) - return status; + bytes = iomap_zero(inode, pos, length, iomap, srcmap); + if (bytes < 0) + return bytes; pos += bytes; - count -= bytes; + length -= bytes; written += bytes; if (did_zero) *did_zero = true; - } while (count > 0); + } while (length > 0); return written; } @@ -1070,7 +1039,7 @@ EXPORT_SYMBOL_GPL(iomap_page_mkwrite); static void iomap_finish_page_writeback(struct inode *inode, struct page *page, - int error) + int error, unsigned int len) { struct iomap_page *iop = to_iomap_page(page); @@ -1079,10 +1048,10 @@ iomap_finish_page_writeback(struct inode *inode, struct page *page, mapping_set_error(inode->i_mapping, -EIO); } - WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop); - WARN_ON_ONCE(iop && atomic_read(&iop->write_count) <= 0); + WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop); + WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0); - if (!iop || atomic_dec_and_test(&iop->write_count)) + if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending)) end_page_writeback(page); } @@ -1116,7 +1085,8 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) /* walk each page on bio, ending page IO on them */ bio_for_each_segment_all(bv, bio, iter_all) - iomap_finish_page_writeback(inode, bv->bv_page, error); + iomap_finish_page_writeback(inode, bv->bv_page, error, + bv->bv_len); bio_put(bio); } /* The ioend has been freed by bio_put() */ @@ -1332,8 +1302,8 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, &same_page); - if (iop && !same_page) - atomic_inc(&iop->write_count); + if (iop) + atomic_add(len, &iop->write_bytes_pending); if (!merged) { if (bio_full(wpc->ioend->io_bio, len)) { @@ -1375,8 +1345,8 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, int error = 0, count = 0, i; LIST_HEAD(submit_list); - WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop); - WARN_ON_ONCE(iop && atomic_read(&iop->write_count) != 0); + WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop); + WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0); /* * Walk through the page to find areas to write back. If we run off the diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index c1aafb2ab990..933f234d5bec 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -76,7 +76,7 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, dio->submit.cookie = submit_bio(bio); } -static ssize_t iomap_dio_complete(struct iomap_dio *dio) +ssize_t iomap_dio_complete(struct iomap_dio *dio) { const struct iomap_dio_ops *dops = dio->dops; struct kiocb *iocb = dio->iocb; @@ -108,7 +108,7 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. 
*/ - if (!dio->error && + if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { int err; err = invalidate_inode_pages2_range(inode->i_mapping, @@ -118,6 +118,7 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) dio_warn_stale_pagecache(iocb->ki_filp); } + inode_dio_end(file_inode(iocb->ki_filp)); /* * If this is a DSYNC write, make sure we push it to stable storage now * that we've written data. @@ -125,11 +126,11 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) ret = generic_write_sync(iocb, ret); - inode_dio_end(file_inode(iocb->ki_filp)); kfree(dio); return ret; } +EXPORT_SYMBOL_GPL(iomap_dio_complete); static void iomap_dio_complete_work(struct work_struct *work) { @@ -388,6 +389,16 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, return iomap_dio_bio_actor(inode, pos, length, dio, iomap); case IOMAP_INLINE: return iomap_dio_inline_actor(inode, pos, length, dio, iomap); + case IOMAP_DELALLOC: + /* + * DIO is not serialised against mmap() access at all, and so + * if the page_mkwrite occurs between the writeback and the + * iomap_apply() call in the DIO path, then it will see the + * DELALLOC block that the page-mkwrite allocated. + */ + pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n", + dio->iocb->ki_filp, current->comm); + return -EIO; default: WARN_ON_ONCE(1); return -EIO; @@ -406,8 +417,8 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, * Returns -ENOTBLK In case of a page invalidation invalidation failure for * writes. The callers needs to fall back to buffered I/O in this case. */ -ssize_t -iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, +struct iomap_dio * +__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, bool wait_for_completion) { @@ -421,14 +432,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_dio *dio; if (!count) - return 0; + return NULL; if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion)) - return -EIO; + return ERR_PTR(-EIO); dio = kmalloc(sizeof(*dio), GFP_KERNEL); if (!dio) - return -ENOMEM; + return ERR_PTR(-ENOMEM); dio->iocb = iocb; atomic_set(&dio->ref, 1); @@ -558,7 +569,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->wait_for_completion = wait_for_completion; if (!atomic_dec_and_test(&dio->ref)) { if (!wait_for_completion) - return -EIOCBQUEUED; + return ERR_PTR(-EIOCBQUEUED); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -574,10 +585,26 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, __set_current_state(TASK_RUNNING); } - return iomap_dio_complete(dio); + return dio; out_free_dio: kfree(dio); - return ret; + if (ret) + return ERR_PTR(ret); + return NULL; +} +EXPORT_SYMBOL_GPL(__iomap_dio_rw); + +ssize_t +iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, + const struct iomap_ops *ops, const struct iomap_dio_ops *dops, + bool wait_for_completion) +{ + struct iomap_dio *dio; + + dio = __iomap_dio_rw(iocb, iter, ops, dops, wait_for_completion); + if (IS_ERR_OR_NULL(dio)) + return PTR_ERR_OR_ZERO(dio); + return iomap_dio_complete(dio); } EXPORT_SYMBOL_GPL(iomap_dio_rw); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index a2f5338a5ea1..176580f54af9 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -473,7 +473,7 @@ static int metapage_readpage(struct file *fp, struct page *page) struct inode *inode = 
page->mapping->host; struct bio *bio = NULL; int block_offset; - int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; + int blocks_per_page = i_blocks_per_page(inode, page); sector_t page_start; /* address of page in fs blocks */ sector_t pblock; int xlen; diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c new file mode 100644 index 000000000000..90d255fbdd9b --- /dev/null +++ b/fs/kernel_read_file.c @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/fs.h> +#include <linux/fs_struct.h> +#include <linux/kernel_read_file.h> +#include <linux/security.h> +#include <linux/vmalloc.h> + +/** + * kernel_read_file() - read file contents into a kernel buffer + * + * @file file to read from + * @offset where to start reading from (see below). + * @buf pointer to a "void *" buffer for reading into (if + * *@buf is NULL, a buffer will be allocated, and + * @buf_size will be ignored) + * @buf_size size of buf, if already allocated. If @buf not + * allocated, this is the largest size to allocate. + * @file_size if non-NULL, the full size of @file will be + * written here. + * @id the kernel_read_file_id identifying the type of + * file contents being read (for LSMs to examine) + * + * @offset must be 0 unless both @buf and @file_size are non-NULL + * (i.e. the caller must be expecting to read partial file contents + * via an already-allocated @buf, in at most @buf_size chunks, and + * will be able to determine when the entire file was read by + * checking @file_size). This isn't a recommended way to read a + * file, though, since it is possible that the contents might + * change between calls to kernel_read_file(). + * + * Returns number of bytes read (no single read will be bigger + * than INT_MAX), or negative on error. + * + */ +int kernel_read_file(struct file *file, loff_t offset, void **buf, + size_t buf_size, size_t *file_size, + enum kernel_read_file_id id) +{ + loff_t i_size, pos; + size_t copied; + void *allocated = NULL; + bool whole_file; + int ret; + + if (offset != 0 && (!*buf || !file_size)) + return -EINVAL; + + if (!S_ISREG(file_inode(file)->i_mode)) + return -EINVAL; + + ret = deny_write_access(file); + if (ret) + return ret; + + i_size = i_size_read(file_inode(file)); + if (i_size <= 0) { + ret = -EINVAL; + goto out; + } + /* The file is too big for sane activities. */ + if (i_size > INT_MAX) { + ret = -EFBIG; + goto out; + } + /* The entire file cannot be read in one buffer. */ + if (!file_size && offset == 0 && i_size > buf_size) { + ret = -EFBIG; + goto out; + } + + whole_file = (offset == 0 && i_size <= buf_size); + ret = security_kernel_read_file(file, id, whole_file); + if (ret) + goto out; + + if (file_size) + *file_size = i_size; + + if (!*buf) + *buf = allocated = vmalloc(i_size); + if (!*buf) { + ret = -ENOMEM; + goto out; + } + + pos = offset; + copied = 0; + while (copied < buf_size) { + ssize_t bytes; + size_t wanted = min_t(size_t, buf_size - copied, + i_size - pos); + + bytes = kernel_read(file, *buf + copied, wanted, &pos); + if (bytes < 0) { + ret = bytes; + goto out_free; + } + + if (bytes == 0) + break; + copied += bytes; + } + + if (whole_file) { + if (pos != i_size) { + ret = -EIO; + goto out_free; + } + + ret = security_kernel_post_read_file(file, *buf, i_size, id); + } + +out_free: + if (ret < 0) { + if (allocated) { + vfree(*buf); + *buf = NULL; + } + } + +out: + allow_write_access(file); + return ret == 0 ? 
copied : ret; +} +EXPORT_SYMBOL_GPL(kernel_read_file); + +int kernel_read_file_from_path(const char *path, loff_t offset, void **buf, + size_t buf_size, size_t *file_size, + enum kernel_read_file_id id) +{ + struct file *file; + int ret; + + if (!path || !*path) + return -EINVAL; + + file = filp_open(path, O_RDONLY, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + + ret = kernel_read_file(file, offset, buf, buf_size, file_size, id); + fput(file); + return ret; +} +EXPORT_SYMBOL_GPL(kernel_read_file_from_path); + +int kernel_read_file_from_path_initns(const char *path, loff_t offset, + void **buf, size_t buf_size, + size_t *file_size, + enum kernel_read_file_id id) +{ + struct file *file; + struct path root; + int ret; + + if (!path || !*path) + return -EINVAL; + + task_lock(&init_task); + get_fs_root(init_task.fs, &root); + task_unlock(&init_task); + + file = file_open_root(root.dentry, root.mnt, path, O_RDONLY, 0); + path_put(&root); + if (IS_ERR(file)) + return PTR_ERR(file); + + ret = kernel_read_file(file, offset, buf, buf_size, file_size, id); + fput(file); + return ret; +} +EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns); + +int kernel_read_file_from_fd(int fd, loff_t offset, void **buf, + size_t buf_size, size_t *file_size, + enum kernel_read_file_id id) +{ + struct fd f = fdget(fd); + int ret = -EBADF; + + if (!f.file) + goto out; + + ret = kernel_read_file(f.file, offset, buf, buf_size, file_size, id); +out: + fdput(f); + return ret; +} +EXPORT_SYMBOL_GPL(kernel_read_file_from_fd); diff --git a/fs/namei.c b/fs/namei.c index e99e2a9da0f7..f1eb8ccd2be9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -568,8 +568,8 @@ static bool path_connected(struct vfsmount *mnt, struct dentry *dentry) { struct super_block *sb = mnt->mnt_sb; - /* Bind mounts and multi-root filesystems can have disconnected paths */ - if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root)) + /* Bind mounts can have disconnected paths */ + if (mnt->mnt_root == sb->s_root) return true; return is_subdir(dentry, mnt->mnt_root); diff --git a/fs/namespace.c b/fs/namespace.c index bae0e95b3713..294e05a13d17 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3072,10 +3072,10 @@ static void shrink_submounts(struct mount *mnt) } } -void *copy_mount_options(const void __user * data) +static void *copy_mount_options(const void __user * data) { char *copy; - unsigned size; + unsigned left, offset; if (!data) return NULL; @@ -3084,20 +3084,31 @@ void *copy_mount_options(const void __user * data) if (!copy) return ERR_PTR(-ENOMEM); - size = PAGE_SIZE - offset_in_page(data); + left = copy_from_user(copy, data, PAGE_SIZE); - if (copy_from_user(copy, data, size)) { + /* + * Not all architectures have an exact copy_from_user(). Resort to + * byte at a time. + */ + offset = PAGE_SIZE - left; + while (left) { + char c; + if (get_user(c, (const char __user *)data + offset)) + break; + copy[offset] = c; + left--; + offset++; + } + + if (left == PAGE_SIZE) { kfree(copy); return ERR_PTR(-EFAULT); } - if (size != PAGE_SIZE) { - if (copy_from_user(copy + size, data + size, PAGE_SIZE - size)) - memset(copy + size, 0, PAGE_SIZE - size); - } + return copy; } -char *copy_mount_string(const void __user *data) +static char *copy_mount_string(const void __user *data) { return data ? 
strndup_user(data, PATH_MAX) : NULL; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e732580fe47b..cb52db9a0cfb 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -579,6 +579,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); do { + if (entry->label) + entry->label->len = NFS4_MAXLABELLEN; + status = xdr_decode(desc, entry, &stream); if (status != 0) { if (status == -EAGAIN) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index ff8965d1a4d4..a163533446fa 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -715,7 +715,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, } static void -ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx) +ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx) { struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); @@ -724,7 +724,7 @@ ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx) } static void -ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx) +ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx) { struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); @@ -734,14 +734,14 @@ ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx) static struct nfs4_pnfs_ds * ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, - int start_idx, int *best_idx, + u32 start_idx, u32 *best_idx, bool check_device) { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; bool fail_return = false; - int idx; + u32 idx; /* mirrors are initially sorted by efficiency */ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { @@ -766,21 +766,21 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, static struct nfs4_pnfs_ds * ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg, - int start_idx, int *best_idx) + u32 start_idx, u32 *best_idx) { return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false); } static struct nfs4_pnfs_ds * ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg, - int start_idx, int *best_idx) + u32 start_idx, u32 *best_idx) { return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true); } static struct nfs4_pnfs_ds * ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, - int start_idx, int *best_idx) + u32 start_idx, u32 *best_idx) { struct nfs4_pnfs_ds *ds; @@ -791,7 +791,8 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, } static struct nfs4_pnfs_ds * -ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, int *best_idx) +ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, + u32 *best_idx) { struct pnfs_layout_segment *lseg = pgio->pg_lseg; struct nfs4_pnfs_ds *ds; @@ -837,7 +838,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_pgio_mirror *pgm; struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; - int ds_idx; + u32 ds_idx, i; retry: ff_layout_pg_check_layout(pgio, req); @@ -863,14 +864,14 @@ retry: goto retry; } - mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); + for (i = 0; i < pgio->pg_mirror_count; i++) { + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); + pgm = &pgio->pg_mirrors[i]; + pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; + } pgio->pg_mirror_idx = ds_idx; - /* read always uses 
only one mirror - idx 0 for pgio layer */ - pgm = &pgio->pg_mirrors[0]; - pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; - if (NFS_SERVER(pgio->pg_inode)->flags & (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)) pgio->pg_maxretrans = io_maxretrans; @@ -894,7 +895,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs4_ff_layout_mirror *mirror; struct nfs_pgio_mirror *pgm; struct nfs4_pnfs_ds *ds; - int i; + u32 i; retry: ff_layout_pg_check_layout(pgio, req); @@ -1038,7 +1039,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr) { u32 idx = hdr->pgio_mirror_idx + 1; - int new_idx = 0; + u32 new_idx = 0; if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx + 1, &new_idx)) ff_layout_send_layouterror(hdr->lseg); @@ -1075,7 +1076,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - int idx) + u32 idx) { struct pnfs_layout_hdr *lo = lseg->pls_layout; struct inode *inode = lo->plh_inode; @@ -1149,7 +1150,7 @@ reset: /* Retry all errors through either pNFS or MDS except for -EJUKEBOX */ static int ff_layout_async_handle_error_v3(struct rpc_task *task, struct pnfs_layout_segment *lseg, - int idx) + u32 idx) { struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); @@ -1184,7 +1185,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - int idx) + u32 idx) { int vers = clp->cl_nfs_mod->rpc_vers->number; @@ -1211,7 +1212,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task, } static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, - int idx, u64 offset, u64 length, + u32 idx, u64 offset, u64 length, u32 *op_status, int opnum, int error) { struct nfs4_ff_layout_mirror *mirror; @@ -1809,7 +1810,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) loff_t offset = hdr->args.offset; int vers; struct nfs_fh *fh; - int idx = hdr->pgio_mirror_idx; + u32 idx = hdr->pgio_mirror_idx; mirror = FF_LAYOUT_COMP(lseg, idx); ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 524812984e2d..222afba70bc0 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -1039,6 +1039,65 @@ out_invalid_fh: } #if IS_ENABLED(CONFIG_NFS_V4) +struct compat_nfs_string { + compat_uint_t len; + compat_uptr_t data; +}; + +static inline void compat_nfs_string(struct nfs_string *dst, + struct compat_nfs_string *src) +{ + dst->data = compat_ptr(src->data); + dst->len = src->len; +} + +struct compat_nfs4_mount_data_v1 { + compat_int_t version; + compat_int_t flags; + compat_int_t rsize; + compat_int_t wsize; + compat_int_t timeo; + compat_int_t retrans; + compat_int_t acregmin; + compat_int_t acregmax; + compat_int_t acdirmin; + compat_int_t acdirmax; + struct compat_nfs_string client_addr; + struct compat_nfs_string mnt_path; + struct compat_nfs_string hostname; + compat_uint_t host_addrlen; + compat_uptr_t host_addr; + compat_int_t proto; + compat_int_t auth_flavourlen; + compat_uptr_t auth_flavours; +}; + +static void nfs4_compat_mount_data_conv(struct nfs4_mount_data *data) +{ + struct compat_nfs4_mount_data_v1 *compat = + (struct compat_nfs4_mount_data_v1 *)data; + + /* copy the fields backwards */ + data->auth_flavours = compat_ptr(compat->auth_flavours); + data->auth_flavourlen = 
compat->auth_flavourlen; + data->proto = compat->proto; + data->host_addr = compat_ptr(compat->host_addr); + data->host_addrlen = compat->host_addrlen; + compat_nfs_string(&data->hostname, &compat->hostname); + compat_nfs_string(&data->mnt_path, &compat->mnt_path); + compat_nfs_string(&data->client_addr, &compat->client_addr); + data->acdirmax = compat->acdirmax; + data->acdirmin = compat->acdirmin; + data->acregmax = compat->acregmax; + data->acregmin = compat->acregmin; + data->retrans = compat->retrans; + data->timeo = compat->timeo; + data->wsize = compat->wsize; + data->rsize = compat->rsize; + data->flags = compat->flags; + data->version = compat->version; +} + /* * Validate NFSv4 mount options */ @@ -1049,89 +1108,83 @@ static int nfs4_parse_monolithic(struct fs_context *fc, struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address; char *c; - if (data == NULL) - goto out_no_data; + if (!data) { + if (is_remount_fc(fc)) + goto done; + return nfs_invalf(fc, + "NFS4: mount program didn't pass any mount data"); + } ctx->version = 4; - switch (data->version) { - case 1: - if (data->host_addrlen > sizeof(ctx->nfs_server.address)) - goto out_no_address; - if (data->host_addrlen == 0) - goto out_no_address; - ctx->nfs_server.addrlen = data->host_addrlen; - if (copy_from_user(sap, data->host_addr, data->host_addrlen)) - return -EFAULT; - if (!nfs_verify_server_address(sap)) - goto out_no_address; - ctx->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); - - if (data->auth_flavourlen) { - rpc_authflavor_t pseudoflavor; - if (data->auth_flavourlen > 1) - goto out_inval_auth; - if (copy_from_user(&pseudoflavor, - data->auth_flavours, - sizeof(pseudoflavor))) - return -EFAULT; - ctx->selected_flavor = pseudoflavor; - } else - ctx->selected_flavor = RPC_AUTH_UNIX; - - c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); - if (IS_ERR(c)) - return PTR_ERR(c); - ctx->nfs_server.hostname = c; + if (data->version != 1) + return generic_parse_monolithic(fc, data); - c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); - if (IS_ERR(c)) - return PTR_ERR(c); - ctx->nfs_server.export_path = c; - dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c); + if (in_compat_syscall()) + nfs4_compat_mount_data_conv(data); - c = strndup_user(data->client_addr.data, 16); - if (IS_ERR(c)) - return PTR_ERR(c); - ctx->client_address = c; - - /* - * Translate to nfs_fs_context, which nfs_fill_super - * can deal with. 
- */ + if (data->host_addrlen > sizeof(ctx->nfs_server.address)) + goto out_no_address; + if (data->host_addrlen == 0) + goto out_no_address; + ctx->nfs_server.addrlen = data->host_addrlen; + if (copy_from_user(sap, data->host_addr, data->host_addrlen)) + return -EFAULT; + if (!nfs_verify_server_address(sap)) + goto out_no_address; + ctx->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); - ctx->flags = data->flags & NFS4_MOUNT_FLAGMASK; - ctx->rsize = data->rsize; - ctx->wsize = data->wsize; - ctx->timeo = data->timeo; - ctx->retrans = data->retrans; - ctx->acregmin = data->acregmin; - ctx->acregmax = data->acregmax; - ctx->acdirmin = data->acdirmin; - ctx->acdirmax = data->acdirmax; - ctx->nfs_server.protocol = data->proto; - nfs_validate_transport_protocol(ctx); - if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP) - goto out_invalid_transport_udp; + if (data->auth_flavourlen) { + rpc_authflavor_t pseudoflavor; - break; - default: - goto generic; + if (data->auth_flavourlen > 1) + goto out_inval_auth; + if (copy_from_user(&pseudoflavor, data->auth_flavours, + sizeof(pseudoflavor))) + return -EFAULT; + ctx->selected_flavor = pseudoflavor; + } else { + ctx->selected_flavor = RPC_AUTH_UNIX; } + c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); + if (IS_ERR(c)) + return PTR_ERR(c); + ctx->nfs_server.hostname = c; + + c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); + if (IS_ERR(c)) + return PTR_ERR(c); + ctx->nfs_server.export_path = c; + dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c); + + c = strndup_user(data->client_addr.data, 16); + if (IS_ERR(c)) + return PTR_ERR(c); + ctx->client_address = c; + + /* + * Translate to nfs_fs_context, which nfs_fill_super + * can deal with. + */ + + ctx->flags = data->flags & NFS4_MOUNT_FLAGMASK; + ctx->rsize = data->rsize; + ctx->wsize = data->wsize; + ctx->timeo = data->timeo; + ctx->retrans = data->retrans; + ctx->acregmin = data->acregmin; + ctx->acregmax = data->acregmax; + ctx->acdirmin = data->acdirmin; + ctx->acdirmax = data->acdirmax; + ctx->nfs_server.protocol = data->proto; + nfs_validate_transport_protocol(ctx); + if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP) + goto out_invalid_transport_udp; +done: ctx->skip_reconfig_option_check = true; return 0; -generic: - return generic_parse_monolithic(fc, data); - -out_no_data: - if (is_remount_fc(fc)) { - ctx->skip_reconfig_option_check = true; - return 0; - } - return nfs_invalf(fc, "NFS4: mount program didn't pass any mount data"); - out_inval_auth: return nfs_invalf(fc, "NFS4: Invalid number of RPC auth flavours %d", data->auth_flavourlen); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 142225f0af59..2b2211d1234e 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -356,7 +356,15 @@ static ssize_t _nfs42_proc_copy(struct file *src, truncate_pagecache_range(dst_inode, pos_dst, pos_dst + res->write_res.count); - + spin_lock(&dst_inode->i_lock); + NFS_I(dst_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE | + NFS_INO_REVAL_FORCED | NFS_INO_INVALID_SIZE | + NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA); + spin_unlock(&dst_inode->i_lock); + spin_lock(&src_inode->i_lock); + NFS_I(src_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE | + NFS_INO_REVAL_FORCED | NFS_INO_INVALID_ATIME); + spin_unlock(&src_inode->i_lock); status = res->write_res.count; out: if (args->sync) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 7a70287f21a2..f943e37853fa 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1200,13 +1200,6 @@ static void nfs_get_cache_cookie(struct 
super_block *sb, } #endif -static void nfs_set_readahead(struct backing_dev_info *bdi, - unsigned long iomax_pages) -{ - bdi->ra_pages = VM_READAHEAD_PAGES; - bdi->io_pages = iomax_pages; -} - int nfs_get_tree_common(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); @@ -1251,7 +1244,7 @@ int nfs_get_tree_common(struct fs_context *fc) MINOR(server->s_dev)); if (error) goto error_splat_super; - nfs_set_readahead(s->s_bdi, server->rpages); + s->s_bdi->io_pages = server->rpages; server->super = s; } diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 311e5ce80cfc..a07c39c94bbd 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -170,7 +170,7 @@ nfsd4_block_proc_getdeviceinfo(struct super_block *sb, struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp) { - if (sb->s_bdev != sb->s_bdev->bd_contains) + if (bdev_is_partition(sb->s_bdev)) return nfserr_inval; return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp)); } @@ -382,7 +382,7 @@ nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb, struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp) { - if (sb->s_bdev != sb->s_bdev->bd_contains) + if (bdev_is_partition(sb->s_bdev)) return nfserr_inval; return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp)); } diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index e516ae389ca5..5900879d5693 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -355,7 +355,7 @@ void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap, /** * nilfs_bmap_assign - assign a new block number to a block * @bmap: bmap - * @bhp: pointer to buffer head + * @bh: pointer to buffer head * @blocknr: block number * @binfo: block information * diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 86d4d850d130..025fb082575a 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -889,7 +889,7 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) * nilfs_cpfile_change_cpmode - change checkpoint mode * @cpfile: inode of checkpoint file * @cno: checkpoint number - * @status: mode of checkpoint + * @mode: mode of checkpoint * * Description: nilfs_change_cpmode() changes the mode of the checkpoint * specified by @cno. The mode @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT. @@ -930,12 +930,12 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) /** * nilfs_cpfile_get_stat - get checkpoint statistics * @cpfile: inode of checkpoint file - * @stat: pointer to a structure of checkpoint statistics + * @cpstat: pointer to a structure of checkpoint statistics * * Description: nilfs_cpfile_get_stat() returns information about checkpoints. * * Return Value: On success, 0 is returned, and checkpoints information is - * stored in the place pointed by @stat. On error, one of the following + * stored in the place pointed by @cpstat. On error, one of the following * negative error codes is returned. * * %-EIO - I/O error. 
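The nfsd blocklayout hunks above swap the open-coded whole-disk test for the new bdev_is_partition() helper. A minimal sketch of the equivalence, assuming the 5.9-era struct block_device fields the old code used (the real helper is expected to test bd_partno instead; the _sketch name is illustrative, not kernel code):

static inline bool bdev_is_partition_sketch(struct block_device *bdev)
{
	/* a whole disk is its own container, so any bdev that differs
	 * from its bd_contains must be a partition */
	return bdev != bdev->bd_contains;
}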
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index b175f1330408..171fb5cd427f 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -69,7 +69,6 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode, /** * nilfs_forget_buffer - discard dirty state - * @inode: owner inode of the buffer * @bh: buffer head of the buffer to be discarded */ void nilfs_forget_buffer(struct buffer_head *bh) diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 42ff67c0c14f..63722475e17e 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -546,13 +546,13 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, /** * nilfs_sufile_get_stat - get segment usage statistics * @sufile: inode of segment usage file - * @stat: pointer to a structure of segment usage statistics + * @sustat: pointer to a structure of segment usage statistics * * Description: nilfs_sufile_get_stat() returns information about segment * usage. * * Return Value: On success, 0 is returned, and segment usage information is - * stored in the place pointed by @stat. On error, one of the following + * stored in the place pointed by @sustat. On error, one of the following * negative error codes is returned. * * %-EIO - I/O error. diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 9bb9f0952b18..caf563981532 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1810,6 +1810,12 @@ int ntfs_read_inode_mount(struct inode *vi) brelse(bh); } + if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) { + ntfs_error(sb, "Incorrect mft record size %u in superblock, should be %u.", + le32_to_cpu(m->bytes_allocated), vol->mft_record_size); + goto err_out; + } + /* Apply the mst fixups. */ if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) { /* FIXME: Try to use the $MFTMirr now. */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 4c1b90442d6f..78710788c237 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6013,7 +6013,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out; } - /* Appending truncate log(TA) and and flushing truncate log(TF) are + /* Appending truncate log(TA) and flushing truncate log(TF) are * two separated transactions. They can be both committed but not * checkpointed. If crash occurs then, both two transaction will be * replayed with several already released to global bitmap clusters. 
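The next ocfs2/alloc.c hunk lets the fstrim loop reschedule each time it releases the main_bm locks and moves on to another allocation group. The pattern, as a sketch with hypothetical helpers (trim_one_group() and next_group() are not ocfs2 functions):

	while (ret >= 0 && group <= last_group) {
		ret = trim_one_group(sb, group);	/* hypothetical helper; takes and drops main_bm locks */
		cond_resched();				/* yield so a long trim cannot monopolize the CPU */
		group = next_group(group);		/* hypothetical helper */
	}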
@@ -7654,8 +7654,10 @@ out_mutex:
 	 * main_bm related locks for avoiding the current IO starve, then go to
 	 * trim the next group
 	 */
-	if (ret >= 0 && group <= last_group)
+	if (ret >= 0 && group <= last_group) {
+		cond_resched();
 		goto next_group;
+	}
 out:
 	range->len = trimmed * sb->s_blocksize;
 	return ret;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 89d13e0705fe..0179a73a3fa2 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1766,7 +1766,6 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
 	int sectsize;
 	char *p = (char *)page;
 	struct fd f;
-	struct inode *inode;
 	ssize_t ret = -EINVAL;
 	int live_threshold;
@@ -1793,20 +1792,16 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
 	    reg->hr_block_bytes == 0)
 		goto out2;
 
-	inode = igrab(f.file->f_mapping->host);
-	if (inode == NULL)
+	if (!S_ISBLK(f.file->f_mapping->host->i_mode))
 		goto out2;
 
-	if (!S_ISBLK(inode->i_mode))
-		goto out3;
-
-	reg->hr_bdev = I_BDEV(f.file->f_mapping->host);
-	ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
-	if (ret) {
+	reg->hr_bdev = blkdev_get_by_dev(f.file->f_mapping->host->i_rdev,
+					 FMODE_WRITE | FMODE_READ, NULL);
+	if (IS_ERR(reg->hr_bdev)) {
+		ret = PTR_ERR(reg->hr_bdev);
 		reg->hr_bdev = NULL;
-		goto out3;
+		goto out2;
 	}
-	inode = NULL;
 
 	bdevname(reg->hr_bdev, reg->hr_dev_name);
@@ -1909,16 +1904,13 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
 			       config_item_name(&reg->hr_item), reg->hr_dev_name);
 
 out3:
-	iput(inode);
+	if (ret < 0) {
+		blkdev_put(reg->hr_bdev, FMODE_READ | FMODE_WRITE);
+		reg->hr_bdev = NULL;
+	}
 out2:
 	fdput(f);
 out:
-	if (ret < 0) {
-		if (reg->hr_bdev) {
-			blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
-			reg->hr_bdev = NULL;
-		}
-	}
 	return ret;
 }
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 720e9f94957e..fc8252a28cb1 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -677,7 +677,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 	/*
 	 * Under certain conditions, the window slide code
 	 * might have reduced the number of bits available or
-	 * disabled the the local alloc entirely. Re-check
+	 * disabled the local alloc entirely. Re-check
 	 * here and return -ENOSPC if necessary.
*/ status = -ENOSPC; diff --git a/fs/pipe.c b/fs/pipe.c index 60dbee457143..0ac197658a2d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -106,25 +106,6 @@ void pipe_double_lock(struct pipe_inode_info *pipe1, } } -/* Drop the inode semaphore and wait for a pipe event, atomically */ -void pipe_wait(struct pipe_inode_info *pipe) -{ - DEFINE_WAIT(rdwait); - DEFINE_WAIT(wrwait); - - /* - * Pipes are system-local resources, so sleeping on them - * is considered a noninteractive wait: - */ - prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE); - prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE); - pipe_unlock(pipe); - schedule(); - finish_wait(&pipe->rd_wait, &rdwait); - finish_wait(&pipe->wr_wait, &wrwait); - pipe_lock(pipe); -} - static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -913,19 +894,18 @@ int create_pipe_files(struct file **res, int flags) { struct inode *inode = get_pipe_inode(); struct file *f; + int error; if (!inode) return -ENFILE; if (flags & O_NOTIFICATION_PIPE) { -#ifdef CONFIG_WATCH_QUEUE - if (watch_queue_init(inode->i_pipe) < 0) { + error = watch_queue_init(inode->i_pipe); + if (error) { + free_pipe_info(inode->i_pipe); iput(inode); - return -ENOMEM; + return error; } -#else - return -ENOPKG; -#endif } f = alloc_file_pseudo(inode, pipe_mnt, "", @@ -1035,12 +1015,52 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes) return do_pipe2(fildes, 0); } +/* + * This is the stupid "wait for pipe to be readable or writable" + * model. + * + * See pipe_read/write() for the proper kind of exclusive wait, + * but that requires that we wake up any other readers/writers + * if we then do not end up reading everything (ie the whole + * "wake_next_reader/writer" logic in pipe_read/write()). + */ +void pipe_wait_readable(struct pipe_inode_info *pipe) +{ + pipe_unlock(pipe); + wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe)); + pipe_lock(pipe); +} + +void pipe_wait_writable(struct pipe_inode_info *pipe) +{ + pipe_unlock(pipe); + wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe)); + pipe_lock(pipe); +} + +/* + * This depends on both the wait (here) and the wakeup (wake_up_partner) + * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot + * race with the count check and waitqueue prep. + * + * Normally in order to avoid races, you'd do the prepare_to_wait() first, + * then check the condition you're waiting for, and only then sleep. But + * because of the pipe lock, we can check the condition before being on + * the wait queue. + * + * We use the 'rd_wait' waitqueue for pipe partner waiting. 
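+ *
+ * Both a reader waiting for a writer and a writer waiting for a reader
+ * sleep on 'rd_wait', which is why wake_up_partner() below only needs
+ * to wake that single queue.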
+ */ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) { + DEFINE_WAIT(rdwait); int cur = *cnt; while (cur == *cnt) { - pipe_wait(pipe); + prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE); + pipe_unlock(pipe); + schedule(); + finish_wait(&pipe->rd_wait, &rdwait); + pipe_lock(pipe); if (signal_pending(current)) break; } @@ -1050,7 +1070,6 @@ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) static void wake_up_partner(struct pipe_inode_info *pipe) { wake_up_interruptible_all(&pipe->rd_wait); - wake_up_interruptible_all(&pipe->wr_wait); } static int fifo_open(struct inode *inode, struct file *filp) diff --git a/fs/proc/base.c b/fs/proc/base.c index 617db4e0faa0..aa69c35d904c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1055,7 +1055,6 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) { - static DEFINE_MUTEX(oom_adj_mutex); struct mm_struct *mm = NULL; struct task_struct *task; int err = 0; @@ -1095,7 +1094,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) struct task_struct *p = find_lock_task_mm(task); if (p) { - if (atomic_read(&p->mm->mm_users) > 1) { + if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) { mm = p->mm; mmgrab(mm); } diff --git a/fs/proc/page.c b/fs/proc/page.c index f909243d4a66..9f1077d94cde 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -217,6 +217,9 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); +#ifdef CONFIG_64BIT + u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); +#endif return u; }; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5066b0251ed8..217aa2705d5d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -520,16 +520,10 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, page = device_private_entry_to_page(swpent); } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap && pte_none(*pte))) { - page = find_get_entry(vma->vm_file->f_mapping, + page = xa_load(&vma->vm_file->f_mapping->i_pages, linear_page_index(vma, addr)); - if (!page) - return; - if (xa_is_value(page)) mss->swap += PAGE_SIZE; - else - put_page(page); - return; } @@ -653,6 +647,10 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_MERGEABLE)] = "mg", [ilog2(VM_UFFD_MISSING)]= "um", [ilog2(VM_UFFD_WP)] = "uw", +#ifdef CONFIG_ARM64_MTE + [ilog2(VM_MTE)] = "mt", + [ilog2(VM_MTE_ALLOWED)] = "", +#endif #ifdef CONFIG_ARCH_HAS_PKEYS /* These come out via ProtectionKey: */ [ilog2(VM_PKEY_BIT0)] = "", @@ -723,9 +721,21 @@ static const struct mm_walk_ops smaps_shmem_walk_ops = { .pte_hole = smaps_pte_hole, }; +/* + * Gather mem stats from @vma with the indicated beginning + * address @start, and keep them in @mss. + * + * Use vm_start of @vma as the beginning address if @start is 0. 
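+ *
+ * A non-zero @start must be below vma->vm_end (the function bails out
+ * early otherwise); show_smaps_rollup() passes last_vma_end here to
+ * resume accounting part-way through a vma after mmap_lock was dropped
+ * and re-acquired.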
+ */ static void smap_gather_stats(struct vm_area_struct *vma, - struct mem_size_stats *mss) + struct mem_size_stats *mss, unsigned long start) { + const struct mm_walk_ops *ops = &smaps_walk_ops; + + /* Invalid start */ + if (start >= vma->vm_end) + return; + #ifdef CONFIG_SHMEM /* In case of smaps_rollup, reset the value from previous vma */ mss->check_shmem_swap = false; @@ -742,18 +752,20 @@ static void smap_gather_stats(struct vm_area_struct *vma, */ unsigned long shmem_swapped = shmem_swap_usage(vma); - if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || - !(vma->vm_flags & VM_WRITE)) { + if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) || + !(vma->vm_flags & VM_WRITE))) { mss->swap += shmem_swapped; } else { mss->check_shmem_swap = true; - walk_page_vma(vma, &smaps_shmem_walk_ops, mss); - return; + ops = &smaps_shmem_walk_ops; } } #endif /* mmap_lock is held in m_start */ - walk_page_vma(vma, &smaps_walk_ops, mss); + if (!start) + walk_page_vma(vma, ops, mss); + else + walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss); } #define SEQ_PUT_DEC(str, val) \ @@ -805,7 +817,7 @@ static int show_smap(struct seq_file *m, void *v) memset(&mss, 0, sizeof(mss)); - smap_gather_stats(vma, &mss); + smap_gather_stats(vma, &mss, 0); show_map_vma(m, vma); @@ -853,9 +865,73 @@ static int show_smaps_rollup(struct seq_file *m, void *v) hold_task_mempolicy(priv); - for (vma = priv->mm->mmap; vma; vma = vma->vm_next) { - smap_gather_stats(vma, &mss); + for (vma = priv->mm->mmap; vma;) { + smap_gather_stats(vma, &mss, 0); last_vma_end = vma->vm_end; + + /* + * Release mmap_lock temporarily if someone wants to + * access it for write request. + */ + if (mmap_lock_is_contended(mm)) { + mmap_read_unlock(mm); + ret = mmap_read_lock_killable(mm); + if (ret) { + release_task_mempolicy(priv); + goto out_put_mm; + } + + /* + * After dropping the lock, there are four cases to + * consider. See the following example for explanation. + * + * +------+------+-----------+ + * | VMA1 | VMA2 | VMA3 | + * +------+------+-----------+ + * | | | | + * 4k 8k 16k 400k + * + * Suppose we drop the lock after reading VMA2 due to + * contention, then we get: + * + * last_vma_end = 16k + * + * 1) VMA2 is freed, but VMA3 exists: + * + * find_vma(mm, 16k - 1) will return VMA3. + * In this case, just continue from VMA3. + * + * 2) VMA2 still exists: + * + * find_vma(mm, 16k - 1) will return VMA2. + * Iterate the loop like the original one. + * + * 3) No more VMAs can be found: + * + * find_vma(mm, 16k - 1) will return NULL. + * No more things to do, just break. + * + * 4) (last_vma_end - 1) is the middle of a vma (VMA'): + * + * find_vma(mm, 16k - 1) will return VMA' whose range + * contains last_vma_end. + * Iterate VMA' from last_vma_end. + */ + vma = find_vma(mm, last_vma_end - 1); + /* Case 3 above */ + if (!vma) + break; + + /* Case 1 above */ + if (vma->vm_start >= last_vma_end) + continue; + + /* Case 4 above */ + if (vma->vm_end > last_vma_end) + smap_gather_stats(vma, &mss, last_vma_end); + } + /* Case 2 above */ + vma = vma->vm_next; } show_vma_header_prefix(m, priv->mm->mmap->vm_start, @@ -1168,24 +1244,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = -EINTR; goto out_mm; } - /* - * Avoid to modify vma->vm_flags - * without locked ops while the - * coredump reads the vm_flags. - */ - if (!mmget_still_valid(mm)) { - /* - * Silently return "count" - * like if get_task_mm() - * failed. 
FIXME: should this - * function have returned - * -ESRCH if get_task_mm() - * failed like if - * get_proc_task() fails? - */ - mmap_write_unlock(mm); - goto out_mm; - } for (vma = mm->mmap; vma; vma = vma->vm_next) { vma->vm_flags &= ~VM_SOFTDIRTY; vma_set_page_prot(vma); diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index d1ceb76adb71..b59cd172b5f9 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -70,8 +70,3 @@ config QFMT_V2 config QUOTACTL bool default n - -config QUOTACTL_COMPAT - bool - depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT - default y diff --git a/fs/quota/Makefile b/fs/quota/Makefile index f2b49d0f0287..9160639daffa 100644 --- a/fs/quota/Makefile +++ b/fs/quota/Makefile @@ -4,5 +4,4 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o obj-$(CONFIG_QUOTA_TREE) += quota_tree.o obj-$(CONFIG_QUOTACTL) += quota.o kqid.o -obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o diff --git a/fs/quota/compat.c b/fs/quota/compat.c deleted file mode 100644 index c30572857619..000000000000 --- a/fs/quota/compat.c +++ /dev/null @@ -1,120 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <linux/syscalls.h> -#include <linux/compat.h> -#include <linux/quotaops.h> - -/* - * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64) - * and is necessary due to alignment problems. - */ -struct compat_if_dqblk { - compat_u64 dqb_bhardlimit; - compat_u64 dqb_bsoftlimit; - compat_u64 dqb_curspace; - compat_u64 dqb_ihardlimit; - compat_u64 dqb_isoftlimit; - compat_u64 dqb_curinodes; - compat_u64 dqb_btime; - compat_u64 dqb_itime; - compat_uint_t dqb_valid; -}; - -/* XFS structures */ -struct compat_fs_qfilestat { - compat_u64 dqb_bhardlimit; - compat_u64 qfs_nblks; - compat_uint_t qfs_nextents; -}; - -struct compat_fs_quota_stat { - __s8 qs_version; - __u16 qs_flags; - __s8 qs_pad; - struct compat_fs_qfilestat qs_uquota; - struct compat_fs_qfilestat qs_gquota; - compat_uint_t qs_incoredqs; - compat_int_t qs_btimelimit; - compat_int_t qs_itimelimit; - compat_int_t qs_rtbtimelimit; - __u16 qs_bwarnlimit; - __u16 qs_iwarnlimit; -}; - -COMPAT_SYSCALL_DEFINE4(quotactl32, unsigned int, cmd, - const char __user *, special, qid_t, id, - void __user *, addr) -{ - unsigned int cmds; - struct if_dqblk __user *dqblk; - struct compat_if_dqblk __user *compat_dqblk; - struct fs_quota_stat __user *fsqstat; - struct compat_fs_quota_stat __user *compat_fsqstat; - compat_uint_t data; - u16 xdata; - long ret; - - cmds = cmd >> SUBCMDSHIFT; - - switch (cmds) { - case Q_GETQUOTA: - dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); - compat_dqblk = addr; - ret = kernel_quotactl(cmd, special, id, dqblk); - if (ret) - break; - if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) || - get_user(data, &dqblk->dqb_valid) || - put_user(data, &compat_dqblk->dqb_valid)) - ret = -EFAULT; - break; - case Q_SETQUOTA: - dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); - compat_dqblk = addr; - ret = -EFAULT; - if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) || - get_user(data, &compat_dqblk->dqb_valid) || - put_user(data, &dqblk->dqb_valid)) - break; - ret = kernel_quotactl(cmd, special, id, dqblk); - break; - case Q_XGETQSTAT: - fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat)); - compat_fsqstat = addr; - ret = kernel_quotactl(cmd, special, id, fsqstat); - if (ret) - break; - ret = -EFAULT; - /* Copying qs_version, qs_flags, qs_pad */ - if (copy_in_user(compat_fsqstat, fsqstat, - 
offsetof(struct compat_fs_quota_stat, qs_uquota))) - break; - /* Copying qs_uquota */ - if (copy_in_user(&compat_fsqstat->qs_uquota, - &fsqstat->qs_uquota, - sizeof(compat_fsqstat->qs_uquota)) || - get_user(data, &fsqstat->qs_uquota.qfs_nextents) || - put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents)) - break; - /* Copying qs_gquota */ - if (copy_in_user(&compat_fsqstat->qs_gquota, - &fsqstat->qs_gquota, - sizeof(compat_fsqstat->qs_gquota)) || - get_user(data, &fsqstat->qs_gquota.qfs_nextents) || - put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents)) - break; - /* Copying the rest */ - if (copy_in_user(&compat_fsqstat->qs_incoredqs, - &fsqstat->qs_incoredqs, - sizeof(struct compat_fs_quota_stat) - - offsetof(struct compat_fs_quota_stat, qs_incoredqs)) || - get_user(xdata, &fsqstat->qs_iwarnlimit) || - put_user(xdata, &compat_fsqstat->qs_iwarnlimit)) - break; - ret = 0; - break; - default: - ret = kernel_quotactl(cmd, special, id, addr); - } - return ret; -} diff --git a/fs/quota/compat.h b/fs/quota/compat.h new file mode 100644 index 000000000000..ef7d1e12d650 --- /dev/null +++ b/fs/quota/compat.h @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/compat.h> + +struct compat_if_dqblk { + compat_u64 dqb_bhardlimit; + compat_u64 dqb_bsoftlimit; + compat_u64 dqb_curspace; + compat_u64 dqb_ihardlimit; + compat_u64 dqb_isoftlimit; + compat_u64 dqb_curinodes; + compat_u64 dqb_btime; + compat_u64 dqb_itime; + compat_uint_t dqb_valid; +}; + +struct compat_fs_qfilestat { + compat_u64 dqb_bhardlimit; + compat_u64 qfs_nblks; + compat_uint_t qfs_nextents; +}; + +struct compat_fs_quota_stat { + __s8 qs_version; + __u16 qs_flags; + __s8 qs_pad; + struct compat_fs_qfilestat qs_uquota; + struct compat_fs_qfilestat qs_gquota; + compat_uint_t qs_incoredqs; + compat_int_t qs_btimelimit; + compat_int_t qs_itimelimit; + compat_int_t qs_rtbtimelimit; + __u16 qs_bwarnlimit; + __u16 qs_iwarnlimit; +}; diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 47f9e151988b..9af95c7a0bbe 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -19,6 +19,7 @@ #include <linux/types.h> #include <linux/writeback.h> #include <linux/nospec.h> +#include "compat.h" static int check_quotactl_permission(struct super_block *sb, int type, int cmd, qid_t id) @@ -211,8 +212,18 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id, if (ret) return ret; copy_to_if_dqblk(&idq, &fdq); - if (copy_to_user(addr, &idq, sizeof(idq))) - return -EFAULT; + + if (compat_need_64bit_alignment_fixup()) { + struct compat_if_dqblk __user *compat_dqblk = addr; + + if (copy_to_user(compat_dqblk, &idq, sizeof(*compat_dqblk))) + return -EFAULT; + if (put_user(idq.dqb_valid, &compat_dqblk->dqb_valid)) + return -EFAULT; + } else { + if (copy_to_user(addr, &idq, sizeof(idq))) + return -EFAULT; + } return 0; } @@ -277,8 +288,16 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id, struct if_dqblk idq; struct kqid qid; - if (copy_from_user(&idq, addr, sizeof(idq))) - return -EFAULT; + if (compat_need_64bit_alignment_fixup()) { + struct compat_if_dqblk __user *compat_dqblk = addr; + + if (copy_from_user(&idq, compat_dqblk, sizeof(*compat_dqblk)) || + get_user(idq.dqb_valid, &compat_dqblk->dqb_valid)) + return -EFAULT; + } else { + if (copy_from_user(&idq, addr, sizeof(idq))) + return -EFAULT; + } if (!sb->s_qcop->set_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); @@ -382,6 +401,33 @@ static int quota_getstate(struct super_block *sb, int type, return 0; } +static int 
compat_copy_fs_qfilestat(struct compat_fs_qfilestat __user *to, + struct fs_qfilestat *from) +{ + if (copy_to_user(to, from, sizeof(*to)) || + put_user(from->qfs_nextents, &to->qfs_nextents)) + return -EFAULT; + return 0; +} + +static int compat_copy_fs_quota_stat(struct compat_fs_quota_stat __user *to, + struct fs_quota_stat *from) +{ + if (put_user(from->qs_version, &to->qs_version) || + put_user(from->qs_flags, &to->qs_flags) || + put_user(from->qs_pad, &to->qs_pad) || + compat_copy_fs_qfilestat(&to->qs_uquota, &from->qs_uquota) || + compat_copy_fs_qfilestat(&to->qs_gquota, &from->qs_gquota) || + put_user(from->qs_incoredqs, &to->qs_incoredqs) || + put_user(from->qs_btimelimit, &to->qs_btimelimit) || + put_user(from->qs_itimelimit, &to->qs_itimelimit) || + put_user(from->qs_rtbtimelimit, &to->qs_rtbtimelimit) || + put_user(from->qs_bwarnlimit, &to->qs_bwarnlimit) || + put_user(from->qs_iwarnlimit, &to->qs_iwarnlimit)) + return -EFAULT; + return 0; +} + static int quota_getxstate(struct super_block *sb, int type, void __user *addr) { struct fs_quota_stat fqs; @@ -390,9 +436,14 @@ static int quota_getxstate(struct super_block *sb, int type, void __user *addr) if (!sb->s_qcop->get_state) return -ENOSYS; ret = quota_getstate(sb, type, &fqs); - if (!ret && copy_to_user(addr, &fqs, sizeof(fqs))) + if (ret) + return ret; + + if (compat_need_64bit_alignment_fixup()) + return compat_copy_fs_quota_stat(addr, &fqs); + if (copy_to_user(addr, &fqs, sizeof(fqs))) return -EFAULT; - return ret; + return 0; } static int quota_getstatev(struct super_block *sb, int type, @@ -481,6 +532,14 @@ static inline u64 quota_btobb(u64 bytes) return (bytes + (1 << XFS_BB_SHIFT) - 1) >> XFS_BB_SHIFT; } +static inline s64 copy_from_xfs_dqblk_ts(const struct fs_disk_quota *d, + __s32 timer, __s8 timer_hi) +{ + if (d->d_fieldmask & FS_DQ_BIGTIME) + return (u32)timer | (s64)timer_hi << 32; + return timer; +} + static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src) { dst->d_spc_hardlimit = quota_bbtob(src->d_blk_hardlimit); @@ -489,14 +548,17 @@ static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src) dst->d_ino_softlimit = src->d_ino_softlimit; dst->d_space = quota_bbtob(src->d_bcount); dst->d_ino_count = src->d_icount; - dst->d_ino_timer = src->d_itimer; - dst->d_spc_timer = src->d_btimer; + dst->d_ino_timer = copy_from_xfs_dqblk_ts(src, src->d_itimer, + src->d_itimer_hi); + dst->d_spc_timer = copy_from_xfs_dqblk_ts(src, src->d_btimer, + src->d_btimer_hi); dst->d_ino_warns = src->d_iwarns; dst->d_spc_warns = src->d_bwarns; dst->d_rt_spc_hardlimit = quota_bbtob(src->d_rtb_hardlimit); dst->d_rt_spc_softlimit = quota_bbtob(src->d_rtb_softlimit); dst->d_rt_space = quota_bbtob(src->d_rtbcount); - dst->d_rt_spc_timer = src->d_rtbtimer; + dst->d_rt_spc_timer = copy_from_xfs_dqblk_ts(src, src->d_rtbtimer, + src->d_rtbtimer_hi); dst->d_rt_spc_warns = src->d_rtbwarns; dst->d_fieldmask = 0; if (src->d_fieldmask & FS_DQ_ISOFT) @@ -588,10 +650,26 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id, return sb->s_qcop->set_dqblk(sb, qid, &qdq); } +static inline void copy_to_xfs_dqblk_ts(const struct fs_disk_quota *d, + __s32 *timer_lo, __s8 *timer_hi, s64 timer) +{ + *timer_lo = timer; + if (d->d_fieldmask & FS_DQ_BIGTIME) + *timer_hi = timer >> 32; +} + +static inline bool want_bigtime(s64 timer) +{ + return timer > S32_MAX || timer < S32_MIN; +} + static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src, int type, qid_t id) { memset(dst, 0, 
sizeof(*dst)); + if (want_bigtime(src->d_ino_timer) || want_bigtime(src->d_spc_timer) || + want_bigtime(src->d_rt_spc_timer)) + dst->d_fieldmask |= FS_DQ_BIGTIME; dst->d_version = FS_DQUOT_VERSION; dst->d_id = id; if (type == USRQUOTA) @@ -606,14 +684,17 @@ static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src, dst->d_ino_softlimit = src->d_ino_softlimit; dst->d_bcount = quota_btobb(src->d_space); dst->d_icount = src->d_ino_count; - dst->d_itimer = src->d_ino_timer; - dst->d_btimer = src->d_spc_timer; + copy_to_xfs_dqblk_ts(dst, &dst->d_itimer, &dst->d_itimer_hi, + src->d_ino_timer); + copy_to_xfs_dqblk_ts(dst, &dst->d_btimer, &dst->d_btimer_hi, + src->d_spc_timer); dst->d_iwarns = src->d_ino_warns; dst->d_bwarns = src->d_spc_warns; dst->d_rtb_hardlimit = quota_btobb(src->d_rt_spc_hardlimit); dst->d_rtb_softlimit = quota_btobb(src->d_rt_spc_softlimit); dst->d_rtbcount = quota_btobb(src->d_rt_space); - dst->d_rtbtimer = src->d_rt_spc_timer; + copy_to_xfs_dqblk_ts(dst, &dst->d_rtbtimer, &dst->d_rtbtimer_hi, + src->d_rt_spc_timer); dst->d_rtbwarns = src->d_rt_spc_warns; } @@ -816,8 +897,8 @@ static struct super_block *quotactl_block(const char __user *special, int cmd) * calls. Maybe we need to add the process quotas etc. in the future, * but we probably should use rlimits for that. */ -int kernel_quotactl(unsigned int cmd, const char __user *special, - qid_t id, void __user *addr) +SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, + qid_t, id, void __user *, addr) { uint cmds, type; struct super_block *sb = NULL; @@ -871,9 +952,3 @@ out: path_put(pathp); return ret; } - -SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, - qid_t, id, void __user *, addr) -{ - return kernel_quotactl(cmd, special, id, addr); -} diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 58fc2a7c7fd1..e69a2bfdd81c 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -282,6 +282,7 @@ static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot) d->dqb_curspace = cpu_to_le64(m->dqb_curspace); d->dqb_btime = cpu_to_le64(m->dqb_btime); d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id)); + d->dqb_pad = 0; if (qtree_entry_unused(info, dp)) d->dqb_itime = cpu_to_le64(1); } diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 414695454956..355523f4a4bf 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -224,7 +224,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, if (!pages) goto out_free; - nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages); + nr = find_get_pages_contig(inode->i_mapping, pgoff, lpages, pages); if (nr != lpages) goto out_free_pages; /* leave if some pages were missing */ diff --git a/fs/read_write.c b/fs/read_write.c index 5db58b8c78d0..19f5c4bf75aa 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -538,6 +538,14 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t inc_syscw(current); return ret; } +/* + * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", + * but autofs is one of the few internal kernel users that actually + * wants this _and_ can be built as a module. So we need to export + * this symbol for autofs, even though it really isn't appropriate + * for any other kernel modules. 
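A minimal sketch of the kind of modular caller this export is aimed at (the helper below is hypothetical; only the __kernel_write() signature is taken from the definition above):

#include <linux/fs.h>

/* Hypothetical autofs-style helper: push a packet into an
 * already-opened file from kernel space. */
static ssize_t demo_write_packet(struct file *filp, const void *pkt,
                                 size_t len)
{
        loff_t pos = 0;

        return __kernel_write(filp, pkt, len, &pos);
}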
+ */ +EXPORT_SYMBOL_GPL(__kernel_write); ssize_t kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) @@ -752,185 +760,6 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, return ret; } -/** - * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace - * into the kernel and check that it is valid. - * - * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE. - * @uvector: Pointer to the userspace array. - * @nr_segs: Number of elements in userspace array. - * @fast_segs: Number of elements in @fast_pointer. - * @fast_pointer: Pointer to (usually small on-stack) kernel array. - * @ret_pointer: (output parameter) Pointer to a variable that will point to - * either @fast_pointer, a newly allocated kernel array, or NULL, - * depending on which array was used. - * - * This function copies an array of &struct iovec of @nr_segs from - * userspace into the kernel and checks that each element is valid (e.g. - * it does not point to a kernel address or cause overflow by being too - * large, etc.). - * - * As an optimization, the caller may provide a pointer to a small - * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long - * (the size of this array, or 0 if unused, should be given in @fast_segs). - * - * @ret_pointer will always point to the array that was used, so the - * caller must take care not to call kfree() on it e.g. in case the - * @fast_pointer array was used and it was allocated on the stack. - * - * Return: The total number of bytes covered by the iovec array on success - * or a negative error code on error. - */ -ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, - unsigned long nr_segs, unsigned long fast_segs, - struct iovec *fast_pointer, - struct iovec **ret_pointer) -{ - unsigned long seg; - ssize_t ret; - struct iovec *iov = fast_pointer; - - /* - * SuS says "The readv() function *may* fail if the iovcnt argument - * was less than or equal to 0, or greater than {IOV_MAX}. Linux has - * traditionally returned zero for zero segments, so... - */ - if (nr_segs == 0) { - ret = 0; - goto out; - } - - /* - * First get the "struct iovec" from user memory and - * verify all the pointers - */ - if (nr_segs > UIO_MAXIOV) { - ret = -EINVAL; - goto out; - } - if (nr_segs > fast_segs) { - iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); - if (iov == NULL) { - ret = -ENOMEM; - goto out; - } - } - if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) { - ret = -EFAULT; - goto out; - } - - /* - * According to the Single Unix Specification we should return EINVAL - * if an element length is < 0 when cast to ssize_t or if the - * total length would overflow the ssize_t return value of the - * system call. - * - * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the - * overflow case. 
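The capping rule in the comment being removed survives in the generic iovec import code; a stand-alone rendering of just that rule (MAX_RW_COUNT really is INT_MAX & PAGE_MASK in the kernel; everything else here is invented for the demo):

#include <sys/uio.h>

#define MAX_RW_COUNT_DEMO 0x7ffff000    /* INT_MAX & PAGE_MASK */

/* Reject any segment whose length is negative when cast to ssize_t,
 * and shrink later segments so the running total cannot overflow. */
static ssize_t total_iov_len(struct iovec *iov, unsigned long nr_segs)
{
        ssize_t total = 0;
        unsigned long seg;

        for (seg = 0; seg < nr_segs; seg++) {
                ssize_t len = (ssize_t)iov[seg].iov_len;

                if (len < 0)
                        return -1;      /* -EINVAL in the kernel */
                if (len > MAX_RW_COUNT_DEMO - total)
                        iov[seg].iov_len = len = MAX_RW_COUNT_DEMO - total;
                total += len;
        }
        return total;
}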
- */ - ret = 0; - for (seg = 0; seg < nr_segs; seg++) { - void __user *buf = iov[seg].iov_base; - ssize_t len = (ssize_t)iov[seg].iov_len; - - /* see if we we're about to use an invalid len or if - * it's about to overflow ssize_t */ - if (len < 0) { - ret = -EINVAL; - goto out; - } - if (type >= 0 - && unlikely(!access_ok(buf, len))) { - ret = -EFAULT; - goto out; - } - if (len > MAX_RW_COUNT - ret) { - len = MAX_RW_COUNT - ret; - iov[seg].iov_len = len; - } - ret += len; - } -out: - *ret_pointer = iov; - return ret; -} - -#ifdef CONFIG_COMPAT -ssize_t compat_rw_copy_check_uvector(int type, - const struct compat_iovec __user *uvector, unsigned long nr_segs, - unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer) -{ - compat_ssize_t tot_len; - struct iovec *iov = *ret_pointer = fast_pointer; - ssize_t ret = 0; - int seg; - - /* - * SuS says "The readv() function *may* fail if the iovcnt argument - * was less than or equal to 0, or greater than {IOV_MAX}. Linux has - * traditionally returned zero for zero segments, so... - */ - if (nr_segs == 0) - goto out; - - ret = -EINVAL; - if (nr_segs > UIO_MAXIOV) - goto out; - if (nr_segs > fast_segs) { - ret = -ENOMEM; - iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); - if (iov == NULL) - goto out; - } - *ret_pointer = iov; - - ret = -EFAULT; - if (!access_ok(uvector, nr_segs*sizeof(*uvector))) - goto out; - - /* - * Single unix specification: - * We should -EINVAL if an element length is not >= 0 and fitting an - * ssize_t. - * - * In Linux, the total length is limited to MAX_RW_COUNT, there is - * no overflow possibility. - */ - tot_len = 0; - ret = -EINVAL; - for (seg = 0; seg < nr_segs; seg++) { - compat_uptr_t buf; - compat_ssize_t len; - - if (__get_user(len, &uvector->iov_len) || - __get_user(buf, &uvector->iov_base)) { - ret = -EFAULT; - goto out; - } - if (len < 0) /* size_t not fitting in compat_ssize_t .. */ - goto out; - if (type >= 0 && - !access_ok(compat_ptr(buf), len)) { - ret = -EFAULT; - goto out; - } - if (len > MAX_RW_COUNT - tot_len) - len = MAX_RW_COUNT - tot_len; - tot_len += len; - iov->iov_base = compat_ptr(buf); - iov->iov_len = (compat_size_t) len; - uvector++; - iov++; - } - ret = tot_len; - -out: - return ret; -} -#endif - static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, loff_t *pos, rwf_t flags) { @@ -1247,224 +1076,93 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, return do_pwritev(fd, vec, vlen, pos, flags); } +/* + * Various compat syscalls. Note that they all pretend to take a native + * iovec - import_iovec will properly treat those as compat_iovecs based on + * in_compat_syscall(). 
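The wrappers below can pass compat pointers as struct iovec __user * because import_iovec() now resolves the ABI itself; roughly (a sketch of the lib/iov_iter.c side, per the comment above):

ssize_t import_iovec(int type, const struct iovec __user *uvec,
                     unsigned nr_segs, unsigned fast_segs,
                     struct iovec **iovp, struct iov_iter *i)
{
        return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
                              in_compat_syscall());
}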
+ */ #ifdef CONFIG_COMPAT -static size_t compat_readv(struct file *file, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos, rwf_t flags) -{ - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; - ssize_t ret; - - ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter); - if (ret >= 0) { - ret = do_iter_read(file, &iter, pos, flags); - kfree(iov); - } - if (ret > 0) - add_rchar(current, ret); - inc_syscr(current); - return ret; -} - -static size_t do_compat_readv(compat_ulong_t fd, - const struct compat_iovec __user *vec, - compat_ulong_t vlen, rwf_t flags) -{ - struct fd f = fdget_pos(fd); - ssize_t ret; - loff_t pos; - - if (!f.file) - return -EBADF; - pos = f.file->f_pos; - ret = compat_readv(f.file, vec, vlen, &pos, flags); - if (ret >= 0) - f.file->f_pos = pos; - fdput_pos(f); - return ret; - -} - -COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, - const struct compat_iovec __user *,vec, - compat_ulong_t, vlen) -{ - return do_compat_readv(fd, vec, vlen, 0); -} - -static long do_compat_preadv64(unsigned long fd, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos, rwf_t flags) -{ - struct fd f; - ssize_t ret; - - if (pos < 0) - return -EINVAL; - f = fdget(fd); - if (!f.file) - return -EBADF; - ret = -ESPIPE; - if (f.file->f_mode & FMODE_PREAD) - ret = compat_readv(f.file, vec, vlen, &pos, flags); - fdput(f); - return ret; -} - #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { - return do_compat_preadv64(fd, vec, vlen, pos, 0); + return do_preadv(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return do_compat_preadv64(fd, vec, vlen, pos, 0); + return do_preadv(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) - return do_compat_readv(fd, vec, vlen, flags); - - return do_compat_preadv64(fd, vec, vlen, pos, flags); + return do_readv(fd, vec, vlen, flags); + return do_preadv(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) - return do_compat_readv(fd, vec, vlen, flags); - - return do_compat_preadv64(fd, vec, vlen, pos, flags); -} - -static size_t compat_writev(struct file *file, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos, rwf_t flags) -{ - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; - ssize_t ret; - - ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter); - if (ret >= 0) { - file_start_write(file); - ret = do_iter_write(file, &iter, pos, flags); - file_end_write(file); - kfree(iov); - } - if (ret > 0) - add_wchar(current, ret); - inc_syscw(current); - return ret; -} - -static size_t do_compat_writev(compat_ulong_t fd, - const struct compat_iovec __user* vec, - compat_ulong_t 
vlen, rwf_t flags) -{ - struct fd f = fdget_pos(fd); - ssize_t ret; - loff_t pos; - - if (!f.file) - return -EBADF; - pos = f.file->f_pos; - ret = compat_writev(f.file, vec, vlen, &pos, flags); - if (ret >= 0) - f.file->f_pos = pos; - fdput_pos(f); - return ret; -} - -COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, - const struct compat_iovec __user *, vec, - compat_ulong_t, vlen) -{ - return do_compat_writev(fd, vec, vlen, 0); -} - -static long do_compat_pwritev64(unsigned long fd, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos, rwf_t flags) -{ - struct fd f; - ssize_t ret; - - if (pos < 0) - return -EINVAL; - f = fdget(fd); - if (!f.file) - return -EBADF; - ret = -ESPIPE; - if (f.file->f_mode & FMODE_PWRITE) - ret = compat_writev(f.file, vec, vlen, &pos, flags); - fdput(f); - return ret; + return do_readv(fd, vec, vlen, flags); + return do_preadv(fd, vec, vlen, pos, flags); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { - return do_compat_pwritev64(fd, vec, vlen, pos, 0); + return do_pwritev(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return do_compat_pwritev64(fd, vec, vlen, pos, 0); + return do_pwritev(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) - return do_compat_writev(fd, vec, vlen, flags); - - return do_compat_pwritev64(fd, vec, vlen, pos, flags); + return do_writev(fd, vec, vlen, flags); + return do_pwritev(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, - const struct compat_iovec __user *,vec, + const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) - return do_compat_writev(fd, vec, vlen, flags); - - return do_compat_pwritev64(fd, vec, vlen, pos, flags); + return do_writev(fd, vec, vlen, flags); + return do_pwritev(fd, vec, vlen, pos, flags); } - -#endif +#endif /* CONFIG_COMPAT */ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 1509775da040..c76d563dec0e 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1551,11 +1551,7 @@ void reiserfs_read_locked_inode(struct inode *inode, * set version 1, version 2 could be used too, because stat data * key is the same in both versions */ - key.version = KEY_FORMAT_3_5; - key.on_disk_key.k_dir_id = dirino; - key.on_disk_key.k_objectid = inode->i_ino; - key.on_disk_key.k_offset = 0; - key.on_disk_key.k_type = 0; + _make_cpu_key(&key, KEY_FORMAT_3_5, dirino, inode->i_ino, 0, 0, 3); /* look for the object's stat data */ retval = search_item(inode->i_sb, &key, &path_to_sd); @@ -2163,7 +2159,8 @@ out_end_trans: out_inserted_sd: clear_nlink(inode); th->t_trans_id = 0; /* so the caller can't use this handle later */ - unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ + if (inode->i_state & I_NEW) + unlock_new_inode(inode); iput(inode); return 
err; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index a6bce5b1fb1d..1b9c7a387dc7 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1258,6 +1258,10 @@ static int reiserfs_parse_options(struct super_block *s, "turned on."); return 0; } + if (qf_names[qtype] != + REISERFS_SB(s)->s_qf_names[qtype]) + kfree(qf_names[qtype]); + qf_names[qtype] = NULL; if (*arg) { /* Some filename specified? */ if (REISERFS_SB(s)->s_qf_names[qtype] && strcmp(REISERFS_SB(s)->s_qf_names[qtype], @@ -1287,10 +1291,6 @@ static int reiserfs_parse_options(struct super_block *s, else *mount_options |= 1 << REISERFS_GRPQUOTA; } else { - if (qf_names[qtype] != - REISERFS_SB(s)->s_qf_names[qtype]) - kfree(qf_names[qtype]); - qf_names[qtype] = NULL; if (qtype == USRQUOTA) *mount_options &= ~(1 << REISERFS_USRQUOTA); else diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 28b241cd6987..fe63a7c3e0da 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -674,6 +674,13 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, if (get_inode_sd_version(inode) == STAT_DATA_V1) return -EOPNOTSUPP; + /* + * priv_root needn't be initialized during mount so allow initial + * lookups to succeed. + */ + if (!REISERFS_SB(inode->i_sb)->priv_root) + return 0; + dentry = xattr_lookup(inode, name, XATTR_REPLACE); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index e582d001f792..b1b7d3f5752f 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -356,6 +356,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) } i->i_mode = mode; + i->i_blocks = (i->i_size + 511) >> 9; unlock_new_inode(i); return i; diff --git a/fs/splice.c b/fs/splice.c index d7c8a7c4db07..70cc52af780b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -33,7 +33,6 @@ #include <linux/security.h> #include <linux/gfp.h> #include <linux/socket.h> -#include <linux/compat.h> #include <linux/sched/signal.h> #include "internal.h" @@ -526,6 +525,22 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des return 1; } +/* We know we have a pipe buffer, but maybe it's empty? */ +static inline bool eat_empty_buffer(struct pipe_inode_info *pipe) +{ + unsigned int tail = pipe->tail; + unsigned int mask = pipe->ring_size - 1; + struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + + if (unlikely(!buf->len)) { + pipe_buf_release(pipe, buf); + pipe->tail = tail+1; + return true; + } + + return false; +} + /** * splice_from_pipe_next - wait for some data to splice from * @pipe: pipe to splice from @@ -545,6 +560,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des if (signal_pending(current)) return -ERESTARTSYS; +repeat: while (pipe_empty(pipe->head, pipe->tail)) { if (!pipe->writers) return 0; @@ -563,9 +579,12 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des sd->need_wakeup = false; } - pipe_wait(pipe); + pipe_wait_readable(pipe); } + if (eat_empty_buffer(pipe)) + goto repeat; + return 1; } @@ -1077,7 +1096,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) return -EAGAIN; if (signal_pending(current)) return -ERESTARTSYS; - pipe_wait(pipe); + pipe_wait_writable(pipe); } } @@ -1332,20 +1351,6 @@ static int vmsplice_type(struct fd f, int *type) * Currently we punt and implement it as a normal copy, see pipe_to_user(). 
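For orientation, the user-space side of the call whose kernel plumbing is flattened below — a minimal, runnable demo (error handling kept terse):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
        int pfd[2];
        char msg[] = "hello via vmsplice\n";
        struct iovec iov = { .iov_base = msg, .iov_len = sizeof(msg) - 1 };
        char buf[64];
        ssize_t n;

        if (pipe(pfd) || vmsplice(pfd[1], &iov, 1, 0) < 0)
                return 1;
        n = read(pfd[0], buf, sizeof(buf));     /* the "normal copy" */
        if (n > 0)
                fwrite(buf, 1, (size_t)n, stdout);
        return 0;
}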
* */ -static long do_vmsplice(struct file *f, struct iov_iter *iter, unsigned int flags) -{ - if (unlikely(flags & ~SPLICE_F_ALL)) - return -EINVAL; - - if (!iov_iter_count(iter)) - return 0; - - if (iov_iter_rw(iter) == WRITE) - return vmsplice_to_pipe(f, iter, flags); - else - return vmsplice_to_user(f, iter, flags); -} - SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, unsigned long, nr_segs, unsigned int, flags) { @@ -1356,6 +1361,9 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, struct fd f; int type; + if (unlikely(flags & ~SPLICE_F_ALL)) + return -EINVAL; + f = fdget(fd); error = vmsplice_type(f, &type); if (error) @@ -1363,40 +1371,21 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, error = import_iovec(type, uiov, nr_segs, ARRAY_SIZE(iovstack), &iov, &iter); - if (error >= 0) { - error = do_vmsplice(f.file, &iter, flags); - kfree(iov); - } - fdput(f); - return error; -} - -#ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32, - unsigned int, nr_segs, unsigned int, flags) -{ - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; - ssize_t error; - struct fd f; - int type; + if (error < 0) + goto out_fdput; - f = fdget(fd); - error = vmsplice_type(f, &type); - if (error) - return error; + if (!iov_iter_count(&iter)) + error = 0; + else if (iov_iter_rw(&iter) == WRITE) + error = vmsplice_to_pipe(f.file, &iter, flags); + else + error = vmsplice_to_user(f.file, &iter, flags); - error = compat_import_iovec(type, iov32, nr_segs, - ARRAY_SIZE(iovstack), &iov, &iter); - if (error >= 0) { - error = do_vmsplice(f.file, &iter, flags); - kfree(iov); - } + kfree(iov); +out_fdput: fdput(f); return error; } -#endif SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, @@ -1454,7 +1443,7 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) ret = -EAGAIN; break; } - pipe_wait(pipe); + pipe_wait_readable(pipe); } pipe_unlock(pipe); @@ -1493,7 +1482,7 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) ret = -ERESTARTSYS; break; } - pipe_wait(pipe); + pipe_wait_writable(pipe); } pipe_unlock(pipe); diff --git a/fs/super.c b/fs/super.c index 904459b35119..a51c2083cd6b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1256,6 +1256,8 @@ static int set_bdev_super(struct super_block *s, void *data) s->s_dev = s->s_bdev->bd_dev; s->s_bdi = bdi_get(s->s_bdev->bd_bdi); + if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue)) + s->s_iflags |= SB_I_STABLE_WRITES; return 0; } diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index eb6897ab78e7..96d0da65e088 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -15,6 +15,7 @@ #include <linux/list.h> #include <linux/mutex.h> #include <linux/seq_file.h> +#include <linux/mm.h> #include "sysfs.h" @@ -707,3 +708,57 @@ int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) return 0; } EXPORT_SYMBOL_GPL(sysfs_change_owner); + +/** + * sysfs_emit - scnprintf equivalent, aware of PAGE_SIZE buffer. + * @buf: start of PAGE_SIZE buffer. + * @fmt: format + * @...: optional arguments to @format + * + * + * Returns number of characters written to @buf. + */ +int sysfs_emit(char *buf, const char *fmt, ...) 
+{ + va_list args; + int len; + + if (WARN(!buf || offset_in_page(buf), + "invalid sysfs_emit: buf:%p\n", buf)) + return 0; + + va_start(args, fmt); + len = vscnprintf(buf, PAGE_SIZE, fmt, args); + va_end(args); + + return len; +} +EXPORT_SYMBOL_GPL(sysfs_emit); + +/** + * sysfs_emit_at - scnprintf equivalent, aware of PAGE_SIZE buffer. + * @buf: start of PAGE_SIZE buffer. + * @at: offset in @buf to start write in bytes + * @at must be >= 0 && < PAGE_SIZE + * @fmt: format + * @...: optional arguments to @fmt + * + * + * Returns number of characters written starting at &@buf[@at]. + */ +int sysfs_emit_at(char *buf, int at, const char *fmt, ...) +{ + va_list args; + int len; + + if (WARN(!buf || offset_in_page(buf) || at < 0 || at >= PAGE_SIZE, + "invalid sysfs_emit_at: buf:%p at:%d\n", buf, at)) + return 0; + + va_start(args, fmt); + len = vscnprintf(buf + at, PAGE_SIZE - at, fmt, args); + va_end(args); + + return len; +} +EXPORT_SYMBOL_GPL(sysfs_emit_at); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 9d042942d8b2..155521e51ac5 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -81,19 +81,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, struct ubifs_inode *ui; bool encrypted = false; - if (IS_ENCRYPTED(dir)) { - err = fscrypt_get_encryption_info(dir); - if (err) { - ubifs_err(c, "fscrypt_get_encryption_info failed: %i", err); - return ERR_PTR(err); - } - - if (!fscrypt_has_encryption_key(dir)) - return ERR_PTR(-EPERM); - - encrypted = true; - } - inode = new_inode(c->vfs_sb); ui = ubifs_inode(inode); if (!inode) @@ -112,6 +99,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, current_time(inode); inode->i_mapping->nrpages = 0; + err = fscrypt_prepare_new_inode(dir, inode, &encrypted); + if (err) { + ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); + goto out_iput; + } + switch (mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &ubifs_file_address_operations; @@ -131,7 +124,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, case S_IFBLK: case S_IFCHR: inode->i_op = &ubifs_file_inode_operations; - encrypted = false; break; default: BUG(); @@ -151,9 +143,8 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, if (c->highest_inum >= INUM_WATERMARK) { spin_unlock(&c->cnt_lock); ubifs_err(c, "out of inode numbers"); - make_bad_inode(inode); - iput(inode); - return ERR_PTR(-EINVAL); + err = -EINVAL; + goto out_iput; } ubifs_warn(c, "running out of inode numbers (current %lu, max %u)", (unsigned long)c->highest_inum, INUM_WATERMARK); @@ -171,16 +162,19 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, spin_unlock(&c->cnt_lock); if (encrypted) { - err = fscrypt_inherit_context(dir, inode, &encrypted, true); + err = fscrypt_set_context(inode, NULL); if (err) { - ubifs_err(c, "fscrypt_inherit_context failed: %i", err); - make_bad_inode(inode); - iput(inode); - return ERR_PTR(err); + ubifs_err(c, "fscrypt_set_context failed: %i", err); + goto out_iput; } } return inode; + +out_iput: + make_bad_inode(inode); + iput(inode); + return ERR_PTR(err); } static int dbg_check_name(const struct ubifs_info *c, @@ -515,7 +509,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) if (err) return err; - err = fscrypt_fname_alloc_buffer(dir, UBIFS_MAX_NLEN, &fstr); + err = fscrypt_fname_alloc_buffer(UBIFS_MAX_NLEN, &fstr); if (err) return err; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index a2420c900275..fbddb2a1c03f 100644 --- a/fs/ubifs/super.c 
+++ b/fs/ubifs/super.c @@ -2177,6 +2177,8 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) c->vi.vol_id); if (err) goto out_close; + sb->s_bdi->ra_pages = 0; + sb->s_bdi->io_pages = 0; sb->s_fs_info = c; sb->s_magic = UBIFS_SUPER_MAGIC; diff --git a/fs/udf/directory.c b/fs/udf/directory.c index d9523013096f..73720320f0ab 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -34,7 +34,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, fibh->soffset = fibh->eoffset; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - fi = udf_get_fileident(iinfo->i_ext.i_data - + fi = udf_get_fileident(iinfo->i_data - (iinfo->i_efe ? sizeof(struct extendedFileEntry) : sizeof(struct fileEntry)), diff --git a/fs/udf/file.c b/fs/udf/file.c index 628941a6b79a..ad8eefad27d7 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -50,7 +50,7 @@ static void __udf_adinicb_readpage(struct page *page) * So just sample it once and use the same value everywhere. */ kaddr = kmap_atomic(page); - memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, isize); + memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr, isize); memset(kaddr + isize, 0, PAGE_SIZE - isize); flush_dcache_page(page); SetPageUptodate(page); @@ -76,8 +76,7 @@ static int udf_adinicb_writepage(struct page *page, BUG_ON(!PageLocked(page)); kaddr = kmap_atomic(page); - memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, - i_size_read(inode)); + memcpy(iinfo->i_data + iinfo->i_lenEAttr, kaddr, i_size_read(inode)); SetPageUptodate(page); kunmap_atomic(kaddr); mark_inode_dirty(inode); @@ -215,7 +214,7 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg); case UDF_GETEABLOCK: return copy_to_user((char __user *)arg, - UDF_I(inode)->i_ext.i_data, + UDF_I(inode)->i_data, UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0; default: return -ENOIOCTLCMD; diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 0adb40718a5d..84ed23edebfd 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -67,16 +67,16 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) iinfo->i_efe = 1; if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev) sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE; - iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - - sizeof(struct extendedFileEntry), - GFP_KERNEL); + iinfo->i_data = kzalloc(inode->i_sb->s_blocksize - + sizeof(struct extendedFileEntry), + GFP_KERNEL); } else { iinfo->i_efe = 0; - iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - - sizeof(struct fileEntry), - GFP_KERNEL); + iinfo->i_data = kzalloc(inode->i_sb->s_blocksize - + sizeof(struct fileEntry), + GFP_KERNEL); } - if (!iinfo->i_ext.i_data) { + if (!iinfo->i_data) { iput(inode); return ERR_PTR(-ENOMEM); } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index adaba8e8b326..bb89c3e43212 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -139,23 +139,26 @@ void udf_evict_inode(struct inode *inode) struct udf_inode_info *iinfo = UDF_I(inode); int want_delete = 0; - if (!inode->i_nlink && !is_bad_inode(inode)) { - want_delete = 1; - udf_setsize(inode, 0); - udf_update_inode(inode, IS_SYNC(inode)); + if (!is_bad_inode(inode)) { + if (!inode->i_nlink) { + want_delete = 1; + udf_setsize(inode, 0); + udf_update_inode(inode, IS_SYNC(inode)); + } + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && + inode->i_size != iinfo->i_lenExtents) { + udf_warn(inode->i_sb, + "Inode %lu (mode %o) has inode size %llu different from extent length %llu. 
Filesystem need not be standards compliant.\n", + inode->i_ino, inode->i_mode, + (unsigned long long)inode->i_size, + (unsigned long long)iinfo->i_lenExtents); + } } truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); clear_inode(inode); - if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && - inode->i_size != iinfo->i_lenExtents) { - udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n", - inode->i_ino, inode->i_mode, - (unsigned long long)inode->i_size, - (unsigned long long)iinfo->i_lenExtents); - } - kfree(iinfo->i_ext.i_data); - iinfo->i_ext.i_data = NULL; + kfree(iinfo->i_data); + iinfo->i_data = NULL; udf_clear_extent_cache(inode); if (want_delete) { udf_free_inode(inode); @@ -285,14 +288,14 @@ int udf_expand_file_adinicb(struct inode *inode) kaddr = kmap_atomic(page); memset(kaddr + iinfo->i_lenAlloc, 0x00, PAGE_SIZE - iinfo->i_lenAlloc); - memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, + memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr, iinfo->i_lenAlloc); flush_dcache_page(page); SetPageUptodate(page); kunmap_atomic(kaddr); } down_write(&iinfo->i_data_sem); - memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00, + memset(iinfo->i_data + iinfo->i_lenEAttr, 0x00, iinfo->i_lenAlloc); iinfo->i_lenAlloc = 0; if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) @@ -308,8 +311,7 @@ int udf_expand_file_adinicb(struct inode *inode) lock_page(page); down_write(&iinfo->i_data_sem); kaddr = kmap_atomic(page); - memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, - inode->i_size); + memcpy(iinfo->i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); kunmap_atomic(kaddr); unlock_page(page); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; @@ -396,8 +398,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, } mark_buffer_dirty_inode(dbh, inode); - memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0, - iinfo->i_lenAlloc); + memset(iinfo->i_data + iinfo->i_lenEAttr, 0, iinfo->i_lenAlloc); iinfo->i_lenAlloc = 0; eloc.logicalBlockNum = *block; eloc.partitionReferenceNum = @@ -1260,7 +1261,7 @@ set_size: if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); - memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize, + memset(iinfo->i_data + iinfo->i_lenEAttr + newsize, 0x00, bsize - newsize - udf_file_entry_alloc_offset(inode)); iinfo->i_lenAlloc = newsize; @@ -1411,7 +1412,7 @@ reread: sizeof(struct extendedFileEntry)); if (ret) goto out; - memcpy(iinfo->i_ext.i_data, + memcpy(iinfo->i_data, bh->b_data + sizeof(struct extendedFileEntry), bs - sizeof(struct extendedFileEntry)); } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) { @@ -1420,7 +1421,7 @@ reread: ret = udf_alloc_i_data(inode, bs - sizeof(struct fileEntry)); if (ret) goto out; - memcpy(iinfo->i_ext.i_data, + memcpy(iinfo->i_data, bh->b_data + sizeof(struct fileEntry), bs - sizeof(struct fileEntry)); } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { @@ -1433,7 +1434,7 @@ reread: sizeof(struct unallocSpaceEntry)); if (ret) goto out; - memcpy(iinfo->i_ext.i_data, + memcpy(iinfo->i_data, bh->b_data + sizeof(struct unallocSpaceEntry), bs - sizeof(struct unallocSpaceEntry)); return 0; @@ -1614,8 +1615,8 @@ out: static int udf_alloc_i_data(struct inode *inode, size_t size) { struct udf_inode_info *iinfo = UDF_I(inode); - iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL); - if (!iinfo->i_ext.i_data) + iinfo->i_data = kmalloc(size, 
GFP_KERNEL); + if (!iinfo->i_data) return -ENOMEM; return 0; } @@ -1706,7 +1707,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) use->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); memcpy(bh->b_data + sizeof(struct unallocSpaceEntry), - iinfo->i_ext.i_data, inode->i_sb->s_blocksize - + iinfo->i_data, inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE); crclen = sizeof(struct unallocSpaceEntry); @@ -1772,7 +1773,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) if (iinfo->i_efe == 0) { memcpy(bh->b_data + sizeof(struct fileEntry), - iinfo->i_ext.i_data, + iinfo->i_data, inode->i_sb->s_blocksize - sizeof(struct fileEntry)); fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); @@ -1791,7 +1792,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) crclen = sizeof(struct fileEntry); } else { memcpy(bh->b_data + sizeof(struct extendedFileEntry), - iinfo->i_ext.i_data, + iinfo->i_data, inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry)); efe->objectSize = @@ -2087,7 +2088,7 @@ void udf_write_aext(struct inode *inode, struct extent_position *epos, struct udf_inode_info *iinfo = UDF_I(inode); if (!epos->bh) - ptr = iinfo->i_ext.i_data + epos->offset - + ptr = iinfo->i_data + epos->offset - udf_file_entry_alloc_offset(inode) + iinfo->i_lenEAttr; else @@ -2179,7 +2180,7 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, if (!epos->bh) { if (!epos->offset) epos->offset = udf_file_entry_alloc_offset(inode); - ptr = iinfo->i_ext.i_data + epos->offset - + ptr = iinfo->i_data + epos->offset - udf_file_entry_alloc_offset(inode) + iinfo->i_lenEAttr; alen = udf_file_entry_alloc_offset(inode) + diff --git a/fs/udf/misc.c b/fs/udf/misc.c index 401e64cde1be..eab94527340d 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -52,9 +52,9 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, uint16_t crclen; struct udf_inode_info *iinfo = UDF_I(inode); - ea = iinfo->i_ext.i_data; + ea = iinfo->i_data; if (iinfo->i_lenEAttr) { - ad = iinfo->i_ext.i_data + iinfo->i_lenEAttr; + ad = iinfo->i_data + iinfo->i_lenEAttr; } else { ad = ea; size += sizeof(struct extendedAttrHeaderDesc); @@ -153,7 +153,7 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type, uint32_t offset; struct udf_inode_info *iinfo = UDF_I(inode); - ea = iinfo->i_ext.i_data; + ea = iinfo->i_data; if (iinfo->i_lenEAttr) { struct extendedAttrHeaderDesc *eahd; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 77b6d89b9bcd..e169d8fe35b5 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -460,8 +460,7 @@ add: if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { block = dinfo->i_location.logicalBlockNum; fi = (struct fileIdentDesc *) - (dinfo->i_ext.i_data + - fibh->soffset - + (dinfo->i_data + fibh->soffset - udf_ext0_offset(dir) + dinfo->i_lenEAttr); } else { @@ -940,7 +939,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, mark_buffer_dirty_inode(epos.bh, inode); ea = epos.bh->b_data + udf_ext0_offset(inode); } else - ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr; + ea = iinfo->i_data + iinfo->i_lenEAttr; eoffset = sb->s_blocksize - udf_ext0_offset(inode); pc = (struct pathComponent *)ea; @@ -1120,7 +1119,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, retval = -EIO; if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { dir_fi = udf_get_fileident( - old_iinfo->i_ext.i_data - + old_iinfo->i_data - 
(old_iinfo->i_efe ? sizeof(struct extendedFileEntry) : sizeof(struct fileEntry)), diff --git a/fs/udf/partition.c b/fs/udf/partition.c index 090baff83990..4cbf40575965 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -65,7 +65,7 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, } if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - loc = le32_to_cpu(((__le32 *)(iinfo->i_ext.i_data + + loc = le32_to_cpu(((__le32 *)(iinfo->i_data + vdata->s_start_offset))[block]); goto translate; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 1c42f544096d..faf2017ada11 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -168,7 +168,7 @@ static void init_once(void *foo) { struct udf_inode_info *ei = (struct udf_inode_info *)foo; - ei->i_ext.i_data = NULL; + ei->i_data = NULL; inode_init_once(&ei->vfs_inode); } @@ -854,7 +854,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) uint8_t *outstr; struct buffer_head *bh; uint16_t ident; - int ret = -ENOMEM; + int ret; struct timestamp *ts; outstr = kmalloc(128, GFP_NOFS); @@ -1006,18 +1006,10 @@ int udf_compute_nr_groups(struct super_block *sb, u32 partition) static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) { struct udf_bitmap *bitmap; - int nr_groups; - int size; - - nr_groups = udf_compute_nr_groups(sb, index); - size = sizeof(struct udf_bitmap) + - (sizeof(struct buffer_head *) * nr_groups); - - if (size <= PAGE_SIZE) - bitmap = kzalloc(size, GFP_KERNEL); - else - bitmap = vzalloc(size); /* TODO: get rid of vzalloc */ + int nr_groups = udf_compute_nr_groups(sb, index); + bitmap = kvzalloc(struct_size(bitmap, s_block_bitmap, nr_groups), + GFP_KERNEL); if (!bitmap) return NULL; @@ -1210,7 +1202,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) vat20 = (struct virtualAllocationTable20 *)bh->b_data; } else { vat20 = (struct virtualAllocationTable20 *) - vati->i_ext.i_data; + vati->i_data; } map->s_type_specific.s_virtual.s_start_offset = @@ -1353,6 +1345,12 @@ static int udf_load_sparable_map(struct super_block *sb, (int)spm->numSparingTables); return -EIO; } + if (le32_to_cpu(spm->sizeSparingTable) > sb->s_blocksize) { + udf_err(sb, "error loading logical volume descriptor: " + "Too big sparing table size (%u)\n", + le32_to_cpu(spm->sizeSparingTable)); + return -EIO; + } for (i = 0; i < spm->numSparingTables; i++) { loc = le32_to_cpu(spm->locSparingTable[i]); @@ -1698,7 +1696,8 @@ static noinline int udf_process_sequence( "Pointers (max %u supported)\n", UDF_MAX_TD_NESTING); brelse(bh); - return -EIO; + ret = -EIO; + goto out; } vdp = (struct volDescPtr *)bh->b_data; @@ -1718,7 +1717,8 @@ static noinline int udf_process_sequence( curr = get_volume_descriptor_record(ident, bh, &data); if (IS_ERR(curr)) { brelse(bh); - return PTR_ERR(curr); + ret = PTR_ERR(curr); + goto out; } /* Descriptor we don't care about? 
*/ if (!curr) @@ -1740,28 +1740,31 @@ static noinline int udf_process_sequence( */ if (!data.vds[VDS_POS_PRIMARY_VOL_DESC].block) { udf_err(sb, "Primary Volume Descriptor not found!\n"); - return -EAGAIN; + ret = -EAGAIN; + goto out; } ret = udf_load_pvoldesc(sb, data.vds[VDS_POS_PRIMARY_VOL_DESC].block); if (ret < 0) - return ret; + goto out; if (data.vds[VDS_POS_LOGICAL_VOL_DESC].block) { ret = udf_load_logicalvol(sb, data.vds[VDS_POS_LOGICAL_VOL_DESC].block, fileset); if (ret < 0) - return ret; + goto out; } /* Now handle prevailing Partition Descriptors */ for (i = 0; i < data.num_part_descs; i++) { ret = udf_load_partdesc(sb, data.part_descs_loc[i].rec.block); if (ret < 0) - return ret; + goto out; } - - return 0; + ret = 0; +out: + kfree(data.part_descs_loc); + return ret; } /* diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 25ff91c7e94a..c973db239604 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -122,7 +122,7 @@ static int udf_symlink_filler(struct file *file, struct page *page) down_read(&iinfo->i_data_sem); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr; + symlink = iinfo->i_data + iinfo->i_lenEAttr; } else { bh = sb_bread(inode->i_sb, pos); diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index 4245d1f63258..06ff7006b822 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -45,11 +45,7 @@ struct udf_inode_info { unsigned i_strat4096 : 1; unsigned i_streamdir : 1; unsigned reserved : 25; - union { - struct short_ad *i_sad; - struct long_ad *i_lad; - __u8 *i_data; - } i_ext; + __u8 *i_data; struct kernel_lb_addr i_locStreamdir; __u64 i_lenStreams; struct rw_semaphore i_data_sem; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 0e4a3837da52..000b457ad087 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -601,8 +601,6 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, /* the various vma->vm_userfaultfd_ctx still points to it */ mmap_write_lock(mm); - /* no task can run (and in turn coredump) yet */ - VM_WARN_ON(!mmget_still_valid(mm)); for (vma = mm->mmap; vma; vma = vma->vm_next) if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -842,7 +840,6 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; - bool still_valid; WRITE_ONCE(ctx->released, true); @@ -858,7 +855,6 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * taking the mmap_lock for writing. 
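For reference, the user-space lifecycle that ends in this release path (minimal sketch; UFFDIO_REGISTER and fault servicing omitted):

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct uffdio_api api;
        int uffd = (int)syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);

        if (uffd < 0)
                return 1;
        memset(&api, 0, sizeof(api));
        api.api = UFFD_API;
        if (ioctl(uffd, UFFDIO_API, &api))
                return 1;
        /* ... register ranges, service faults ... */
        close(uffd);    /* final fput ends up in userfaultfd_release() */
        return 0;
}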
*/ mmap_write_lock(mm); - still_valid = mmget_still_valid(mm); prev = NULL; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); @@ -869,17 +865,15 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); - if (still_valid) { - prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, - new_flags, vma->anon_vma, - vma->vm_file, vma->vm_pgoff, - vma_policy(vma), - NULL_VM_UFFD_CTX); - if (prev) - vma = prev; - else - prev = vma; - } + prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, + new_flags, vma->anon_vma, + vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + NULL_VM_UFFD_CTX); + if (prev) + vma = prev; + else + prev = vma; vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } @@ -1309,8 +1303,6 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - if (!mmget_still_valid(mm)) - goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; @@ -1511,8 +1503,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - if (!mmget_still_valid(mm)) - goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c index dd147b490982..4d569f14a8d8 100644 --- a/fs/vboxsf/dir.c +++ b/fs/vboxsf/dir.c @@ -134,7 +134,7 @@ try_next_entry: d_type = vboxsf_get_d_type(info->info.attr.mode); /* - * On 32 bit systems pos is 64 signed, while ino is 32 bit + * On 32-bit systems pos is 64-bit signed, while ino is 32-bit * unsigned so fake_ino may overflow, check for this. */ if ((ino_t)(ctx->pos + 1) != (u64)(ctx->pos + 1)) { diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index 8fe03b4a0d2b..d7816c01a4f6 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -167,6 +167,8 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc) err = super_setup_bdi_name(sb, "vboxsf-%d", sbi->bdi_id); if (err) goto fail_free; + sb->s_bdi->ra_pages = 0; + sb->s_bdi->io_pages = 0; /* Turn source into a shfl_string and map the folder */ size = strlen(fc->source) + 1; @@ -384,7 +386,7 @@ fail_nomem: static int vboxsf_parse_monolithic(struct fs_context *fc, void *data) { - char *options = data; + unsigned char *options = data; if (options && options[0] == VBSF_MOUNT_SIGNATURE_BYTE_0 && options[1] == VBSF_MOUNT_SIGNATURE_BYTE_1 && diff --git a/fs/xattr.c b/fs/xattr.c index 386b45676d7e..cd7a563e8bcd 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -232,15 +232,15 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, } /** - * __vfs_setxattr_locked: set an extended attribute while holding the inode + * __vfs_setxattr_locked - set an extended attribute while holding the inode * lock * - * @dentry - object to perform setxattr on - * @name - xattr name to set - * @value - value to set @name to - * @size - size of @value - * @flags - flags to pass into filesystem operations - * @delegated_inode - on return, will contain an inode pointer that + * @dentry: object to perform setxattr on + * @name: xattr name to set + * @value: value to set @name to + * @size: size of @value + * @flags: flags to pass into filesystem operations + * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. 
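And the matching user-space entry point for the path documented here — a tiny demo (assumes /tmp/demo exists on an xattr-capable filesystem):

#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
        /* Reaches __vfs_setxattr_locked() with the inode lock held,
         * as described in the kerneldoc above. */
        if (setxattr("/tmp/demo", "user.comment", "hello", 5, 0))
                perror("setxattr");
        return 0;
}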
*/ int @@ -443,12 +443,12 @@ __vfs_removexattr(struct dentry *dentry, const char *name) EXPORT_SYMBOL(__vfs_removexattr); /** - * __vfs_removexattr_locked: set an extended attribute while holding the inode + * __vfs_removexattr_locked - remove an extended attribute while holding the inode * lock * - * @dentry - object to perform setxattr on - * @name - name of xattr to remove - * @delegated_inode - on return, will contain an inode pointer that + * @dentry: object to perform removexattr on + * @name: name of xattr to remove + * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index e841ed781a25..e986b95d94c9 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -93,25 +93,3 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags) return ptr; return __kmem_vmalloc(size, flags); } - -void * -kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) -{ - int retries = 0; - gfp_t lflags = kmem_flags_convert(flags); - void *ptr; - - trace_kmem_realloc(newsize, flags, _RET_IP_); - - do { - ptr = krealloc(old, newsize, lflags); - if (ptr || (flags & KM_MAYFAIL)) - return ptr; - if (!(++retries % 100)) - xfs_err(NULL, - "%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)", - current->comm, current->pid, - newsize, __func__, lflags); - congestion_wait(BLK_RW_ASYNC, HZ/50); - } while (1); -} diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 8e8555817e6d..38007117697e 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -59,7 +59,6 @@ kmem_flags_convert(xfs_km_flags_t flags) extern void *kmem_alloc(size_t, xfs_km_flags_t); extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags); extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); -extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); static inline void kmem_free(const void *ptr) { kvfree(ptr); } @@ -72,12 +71,6 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags) return kmem_alloc(size, flags | KM_ZERO); } -static inline void * -kmem_zalloc_large(size_t size, xfs_km_flags_t flags) -{ - return kmem_alloc_large(size, flags | KM_ZERO); -} - /* * Zone interfaces */ diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 8cf73fe4338e..9331f3516afa 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -333,6 +333,11 @@ xfs_agiblock_init( } for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); + if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) { + agi->agi_iblocks = cpu_to_be32(1); + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + agi->agi_fblocks = cpu_to_be32(1); + } } typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp, diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 2e055c079f39..fd8e6418a0d3 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -428,7 +428,7 @@ xfs_attr_set( */ if (XFS_IFORK_Q(dp) == 0) { int sf_size = sizeof(struct xfs_attr_sf_hdr) + - XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, + xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); @@ -523,6 +523,14 @@ out_trans_cancel: * External routines when attribute list is inside the inode *========================================================================*/ +static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) +{ + struct xfs_attr_shortform *sf; + + sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; + return
be16_to_cpu(sf->hdr.totsize); +} + /* * Add a name to the shortform attribute list structure * This is the external routine. @@ -555,8 +563,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX) return -ENOSPC; - newsize = XFS_ATTR_SF_TOTSIZE(args->dp); - newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + newsize = xfs_attr_sf_totsize(args->dp); + newsize += xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize); if (!forkoff) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 305d4bc07337..bb128db220ac 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -684,9 +684,9 @@ xfs_attr_sf_findname( sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data; sfe = &sf->list[0]; end = sf->hdr.count; - for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), + for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe), base += size, i++) { - size = XFS_ATTR_SF_ENTSIZE(sfe); + size = xfs_attr_sf_entsize(sfe); if (!xfs_attr_match(args, sfe->namelen, sfe->nameval, sfe->flags)) continue; @@ -728,15 +728,15 @@ xfs_attr_shortform_add( ifp = dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST) ASSERT(0); offset = (char *)sfe - (char *)sf; - size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); xfs_idata_realloc(dp, size, XFS_ATTR_FORK); - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; - sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset); + sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; + sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset); sfe->namelen = args->namelen; sfe->valuelen = args->valuelen; @@ -787,12 +787,12 @@ xfs_attr_shortform_remove( dp = args->dp; mp = dp->i_mount; - sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; error = xfs_attr_sf_findname(args, &sfe, &base); if (error != -EEXIST) return error; - size = XFS_ATTR_SF_ENTSIZE(sfe); + size = xfs_attr_sf_entsize(sfe); /* * Fix up the attribute fork data, covering the hole @@ -837,8 +837,8 @@ xfs_attr_shortform_remove( int xfs_attr_shortform_lookup(xfs_da_args_t *args) { - xfs_attr_shortform_t *sf; - xfs_attr_sf_entry_t *sfe; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; int i; struct xfs_ifork *ifp; @@ -846,10 +846,10 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) ifp = args->dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + sfe = xfs_attr_sf_nextentry(sfe), i++) { if (xfs_attr_match(args, sfe->namelen, sfe->nameval, sfe->flags)) return -EEXIST; @@ -873,10 +873,10 @@ xfs_attr_shortform_getvalue( int i; ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE); - sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + sfe = xfs_attr_sf_nextentry(sfe), i++) { if (xfs_attr_match(args, sfe->namelen, sfe->nameval, sfe->flags)) return xfs_attr_copy_value(args, @@ -908,12 
+908,12 @@ xfs_attr_shortform_to_leaf( dp = args->dp; ifp = dp->i_afp; - sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; size = be16_to_cpu(sf->hdr.totsize); tmpbuffer = kmem_alloc(size, 0); ASSERT(tmpbuffer != NULL); memcpy(tmpbuffer, ifp->if_u1.if_data, size); - sf = (xfs_attr_shortform_t *)tmpbuffer; + sf = (struct xfs_attr_shortform *)tmpbuffer; xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK); @@ -951,7 +951,7 @@ xfs_attr_shortform_to_leaf( ASSERT(error != -ENOSPC); if (error) goto out; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sfe = xfs_attr_sf_nextentry(sfe); } error = 0; *leaf_bp = bp; @@ -992,9 +992,8 @@ xfs_attr_shortform_allfit( return 0; if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) return 0; - bytes += sizeof(struct xfs_attr_sf_entry) - 1 - + name_loc->namelen - + be16_to_cpu(name_loc->valuelen); + bytes += xfs_attr_sf_entsize_byname(name_loc->namelen, + be16_to_cpu(name_loc->valuelen)); } if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && @@ -1039,7 +1038,7 @@ xfs_attr_shortform_verify( * xfs_attr_sf_entry is defined with a 1-byte variable * array at the end, so we must subtract that off. */ - if (((char *)sfep + sizeof(*sfep) - 1) >= endp) + if (((char *)sfep + sizeof(*sfep)) >= endp) return __this_address; /* Don't allow names with known bad length. */ @@ -1051,7 +1050,7 @@ xfs_attr_shortform_verify( * within the data buffer. The next entry starts after the * name component, so nextentry is an acceptable test. */ - next_sfep = XFS_ATTR_SF_NEXTENTRY(sfep); + next_sfep = xfs_attr_sf_nextentry(sfep); if ((char *)next_sfep > endp) return __this_address; diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index bb004fb7944a..37578b369d9b 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -13,7 +13,6 @@ * to fit into the literal area of the inode. */ typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t; -typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t; /* * We generate this then sort it, attr_list() must return things in hash-order. 
@@ -27,16 +26,26 @@ typedef struct xfs_attr_sf_sort { unsigned char *name; /* name value, pointer into buffer */ } xfs_attr_sf_sort_t; -#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \ - (((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen))) #define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \ ((1 << (NBBY*(int)sizeof(uint8_t))) - 1) -#define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \ - ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen) -#define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \ - ((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep))) -#define XFS_ATTR_SF_TOTSIZE(dp) /* total space in use */ \ - (be16_to_cpu(((xfs_attr_shortform_t *) \ - ((dp)->i_afp->if_u1.if_data))->hdr.totsize)) + +/* space name/value uses */ +static inline int xfs_attr_sf_entsize_byname(uint8_t nlen, uint8_t vlen) +{ + return sizeof(struct xfs_attr_sf_entry) + nlen + vlen; +} + +/* space an entry uses */ +static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep) +{ + return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen); +} + +/* next entry in struct */ +static inline struct xfs_attr_sf_entry * +xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep) +{ + return (void *)sfep + xfs_attr_sf_entsize(sfep); +} #endif /* __XFS_ATTR_SF_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 059ac108b1b3..b40a4e80f5ee 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -579,7 +579,7 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) /* * Entries are packed toward the top as tight as possible. */ -typedef struct xfs_attr_shortform { +struct xfs_attr_shortform { struct xfs_attr_sf_hdr { /* constant-structure header block */ __be16 totsize; /* total bytes in shortform list */ __u8 count; /* count of active entries */ @@ -589,9 +589,9 @@ typedef struct xfs_attr_shortform { uint8_t namelen; /* actual length of name (no NULL) */ uint8_t valuelen; /* actual length of value (no NULL) */ uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ - uint8_t nameval[1]; /* name & value bytes concatenated */ + uint8_t nameval[]; /* name & value bytes concatenated */ } list[1]; /* variable sized array */ -} xfs_attr_shortform_t; +}; typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ __be16 base; /* base of free region */ diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 5a2db00b9d5f..6766417d5ba4 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -69,6 +69,13 @@ xfs_dquot_verify( ddq_type != XFS_DQTYPE_GROUP) return __this_address; + if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && + !xfs_sb_version_hasbigtime(&mp->m_sb)) + return __this_address; + + if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && !ddq->d_id) + return __this_address; + if (id != -1 && id != be32_to_cpu(ddq->d_id)) return __this_address; @@ -288,3 +295,31 @@ const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { .verify_read = xfs_dquot_buf_readahead_verify, .verify_write = xfs_dquot_buf_write_verify, }; + +/* Convert an on-disk timer value into an incore timer value. */ +time64_t +xfs_dquot_from_disk_ts( + struct xfs_disk_dquot *ddq, + __be32 dtimer) +{ + uint32_t t = be32_to_cpu(dtimer); + + if (t != 0 && (ddq->d_type & XFS_DQTYPE_BIGTIME)) + return xfs_dq_bigtime_to_unix(t); + + return t; +} + +/* Convert an incore timer value into an on-disk timer value. 
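The xfs_attr_sf.h conversion above replaces macros that computed entry sizes with sizeof(...) - 1 arithmetic (compensating for the old one-byte nameval[1] array) by helpers over a true flexible array. A minimal userspace sketch of the same packed-record walk, with illustrative names (sf_entry, sf_entsize and sf_next are not the kernel's):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct sf_entry {
        uint8_t namelen;
        uint8_t valuelen;
        uint8_t flags;
        uint8_t nameval[];  /* name bytes followed by value bytes */
    };

    /* bytes one entry occupies: fixed header plus both payloads */
    static size_t sf_entsize(const struct sf_entry *e)
    {
        return sizeof(struct sf_entry) + e->namelen + e->valuelen;
    }

    /* entries are packed back to back, so the next one starts entsize later */
    static const struct sf_entry *sf_next(const struct sf_entry *e)
    {
        return (const struct sf_entry *)((const char *)e + sf_entsize(e));
    }

    int main(void)
    {
        /* two packed entries: ("ab" -> "x") and ("c" -> "yz") */
        uint8_t buf[] = { 2, 1, 0, 'a', 'b', 'x', 1, 2, 0, 'c', 'y', 'z' };
        const struct sf_entry *e = (const struct sf_entry *)buf;
        int i;

        for (i = 0; i < 2; i++, e = sf_next(e))
            printf("entry %d: namelen=%d valuelen=%d\n",
                   i, e->namelen, e->valuelen);
        return 0;
    }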
*/ +__be32 +xfs_dquot_to_disk_ts( + struct xfs_dquot *dqp, + time64_t timer) +{ + uint32_t t = timer; + + if (timer != 0 && (dqp->q_type & XFS_DQTYPE_BIGTIME)) + t = xfs_dq_unix_to_bigtime(timer); + + return cpu_to_be32(t); +} diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 31b7ece985bb..dd764da08f6f 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -449,10 +449,12 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ +#define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ #define XFS_SB_FEAT_RO_COMPAT_ALL \ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ - XFS_SB_FEAT_RO_COMPAT_REFLINK) + XFS_SB_FEAT_RO_COMPAT_REFLINK| \ + XFS_SB_FEAT_RO_COMPAT_INOBTCNT) #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -465,10 +467,12 @@ xfs_sb_has_ro_compat_feature( #define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ #define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ #define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ +#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE| \ XFS_SB_FEAT_INCOMPAT_SPINODES| \ - XFS_SB_FEAT_INCOMPAT_META_UUID) + XFS_SB_FEAT_INCOMPAT_META_UUID| \ + XFS_SB_FEAT_INCOMPAT_BIGTIME) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -563,6 +567,23 @@ static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp) (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK); } +static inline bool xfs_sb_version_hasbigtime(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME); +} + +/* + * Inode btree block counter. We record the number of inobt and finobt blocks + * in the AGI header so that we can skip the finobt walk at mount time when + * setting up per-AG reservations. 
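INOBTCNT joins the read-only-compatible feature set while BIGTIME is incompatible, and that placement decides how older kernels react to the bits. A toy model of the V5 gating policy; can_mount and the mask values below are illustrative, not kernel code:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define RO_COMPAT_KNOWN  0x0000000fU  /* finobt, rmapbt, reflink, inobtcnt */
    #define INCOMPAT_KNOWN   0x0000000fU  /* ftype, spinodes, meta_uuid, bigtime */

    /*
     * Unknown incompat bits make the filesystem unintelligible, so refuse the
     * mount outright; unknown ro-compat bits are safe to read but not to write.
     */
    static bool can_mount(uint32_t incompat, uint32_t ro_compat, bool readonly)
    {
        if (incompat & ~INCOMPAT_KNOWN)
            return false;
        if (!readonly && (ro_compat & ~RO_COMPAT_KNOWN))
            return false;
        return true;
    }

    int main(void)
    {
        printf("%d\n", can_mount(0, 0x10, true));  /* 1: unknown ro-compat, ro mount ok */
        printf("%d\n", can_mount(0, 0x10, false)); /* 0: same bit blocks a rw mount */
        printf("%d\n", can_mount(0x10, 0, true));  /* 0: unknown incompat blocks any mount */
        return 0;
    }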
+ */ +static inline bool xfs_sb_version_hasinobtcounts(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT); +} + /* * end of superblock version macros */ @@ -765,6 +786,9 @@ typedef struct xfs_agi { __be32 agi_free_root; /* root of the free inode btree */ __be32 agi_free_level;/* levels in free inode btree */ + __be32 agi_iblocks; /* inobt blocks used */ + __be32 agi_fblocks; /* finobt blocks used */ + /* structure must be padded to 64 bit alignment */ } xfs_agi_t; @@ -785,7 +809,8 @@ typedef struct xfs_agi { #define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) #define XFS_AGI_FREE_ROOT (1 << 11) #define XFS_AGI_FREE_LEVEL (1 << 12) -#define XFS_AGI_NUM_BITS_R2 13 +#define XFS_AGI_IBLOCKS (1 << 13) /* both inobt/finobt block counters */ +#define XFS_AGI_NUM_BITS_R2 14 /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) @@ -831,10 +856,87 @@ struct xfs_agfl { ASSERT(xfs_daddr_to_agno(mp, d) == \ xfs_daddr_to_agno(mp, (d) + (len) - 1))) -typedef struct xfs_timestamp { +/* + * XFS Timestamps + * ============== + * + * Traditional ondisk inode timestamps consist of signed 32-bit counters for + * seconds and nanoseconds; time zero is the Unix epoch, Jan 1 00:00:00 UTC + * 1970, which means that the timestamp epoch is the same as the Unix epoch. + * Therefore, the ondisk min and max defined here can be used directly to + * constrain the incore timestamps on a Unix system. Note that we actually + * encode a __be64 value on disk. + * + * When the bigtime feature is enabled, ondisk inode timestamps become an + * unsigned 64-bit nanoseconds counter. This means that the bigtime inode + * timestamp epoch is the start of the classic timestamp range, which is + * Dec 13 20:45:52 UTC 1901. Because the epochs are not the same, callers + * /must/ use the bigtime conversion functions when encoding and decoding raw + * timestamps. + */ +typedef __be64 xfs_timestamp_t; + +/* Legacy timestamp encoding format. */ +struct xfs_legacy_timestamp { __be32 t_sec; /* timestamp seconds */ __be32 t_nsec; /* timestamp nanoseconds */ -} xfs_timestamp_t; +}; + +/* + * Smallest possible ondisk seconds value with traditional timestamps. This + * corresponds exactly with the incore timestamp Dec 13 20:45:52 UTC 1901. + */ +#define XFS_LEGACY_TIME_MIN ((int64_t)S32_MIN) + +/* + * Largest possible ondisk seconds value with traditional timestamps. This + * corresponds exactly with the incore timestamp Jan 19 03:14:07 UTC 2038. + */ +#define XFS_LEGACY_TIME_MAX ((int64_t)S32_MAX) + +/* + * Smallest possible ondisk seconds value with bigtime timestamps. This + * corresponds (after conversion to a Unix timestamp) with the traditional + * minimum timestamp of Dec 13 20:45:52 UTC 1901. + */ +#define XFS_BIGTIME_TIME_MIN ((int64_t)0) + +/* + * Largest supported ondisk seconds value with bigtime timestamps. This + * corresponds (after conversion to a Unix timestamp) with an incore timestamp + * of Jul 2 20:20:24 UTC 2486. + * + * We round down the ondisk limit so that the bigtime quota and inode max + * timestamps will be the same. + */ +#define XFS_BIGTIME_TIME_MAX ((int64_t)((-1ULL / NSEC_PER_SEC) & ~0x3ULL)) + +/* + * Bigtime epoch is set exactly to the minimum time value that a traditional + * 32-bit timestamp can represent when using the Unix epoch as a reference. + * Hence the Unix epoch is at a fixed offset into the supported bigtime + * timestamp range.
+ * + * The bigtime epoch also matches the minimum value an on-disk 32-bit XFS + * timestamp can represent, so we will not lose any fidelity in converting + * to/from Unix and bigtime timestamps. + * + * The following conversion factor converts a seconds counter from the Unix + * epoch to the bigtime epoch. + */ +#define XFS_BIGTIME_EPOCH_OFFSET (-(int64_t)S32_MIN) + +/* Convert a timestamp from the Unix epoch to the bigtime epoch. */ +static inline uint64_t xfs_unix_to_bigtime(time64_t unix_seconds) +{ + return (uint64_t)unix_seconds + XFS_BIGTIME_EPOCH_OFFSET; +} + +/* Convert a timestamp from the bigtime epoch to the Unix epoch. */ +static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds) +{ + return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET; +} /* * On-disk inode structure. @@ -1061,12 +1163,22 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ #define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ +#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ + #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) +#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) #define XFS_DIFLAG2_ANY \ - (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE) + (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ + XFS_DIFLAG2_BIGTIME) + +static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) +{ + return dip->di_version >= 3 && + (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME)); +} /* * Inode number format: @@ -1152,13 +1264,98 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DQTYPE_USER 0x01 /* user dquot record */ #define XFS_DQTYPE_PROJ 0x02 /* project dquot record */ #define XFS_DQTYPE_GROUP 0x04 /* group dquot record */ +#define XFS_DQTYPE_BIGTIME 0x80 /* large expiry timestamps */ /* bitmask to determine if this is a user/group/project dquot */ #define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \ XFS_DQTYPE_PROJ | \ XFS_DQTYPE_GROUP) -#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK) +#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK | \ + XFS_DQTYPE_BIGTIME) + +/* + * XFS Quota Timers + * ================ + * + * Traditional quota grace period expiration timers are an unsigned 32-bit + * seconds counter; time zero is the Unix epoch, Jan 1 00:00:00 UTC 1970. + * Note that an expiration value of zero means that the quota limit has not + * been reached, and therefore no expiration has been set. Therefore, the + * ondisk min and max defined here can be used directly to constrain the incore + * quota expiration timestamps on a Unix system. + * + * When bigtime is enabled, we trade two bits of precision to expand the + * expiration timeout range to match that of big inode timestamps. The min and + * max recorded here are the on-disk limits, not a Unix timestamp. + * + * The grace period for each quota type is stored in the root dquot (id = 0) + * and is applied to a non-root dquot when it exceeds the soft or hard limits. + * The lengths of quota grace periods are unsigned 32-bit quantities measured in + * units of seconds. A value of zero means to use the default period. + */ + +/* + * Smallest possible ondisk quota expiration value with traditional timestamps.
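To make the epoch shift concrete, here is a self-contained sketch of the encode/decode pair using the same offset constant under standalone, illustrative names (encode_bigtime and decode_bigtime are not the kernel's):

    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_SEC         1000000000ULL
    #define BIGTIME_EPOCH_OFFSET 2147483648LL  /* -(int64_t)S32_MIN */

    /* incore (Unix seconds + ns) -> ondisk unsigned ns since the bigtime epoch */
    static uint64_t encode_bigtime(int64_t unix_sec, uint32_t nsec)
    {
        return (uint64_t)(unix_sec + BIGTIME_EPOCH_OFFSET) * NSEC_PER_SEC + nsec;
    }

    /* ondisk ns counter -> incore Unix seconds, with the ns remainder split out */
    static int64_t decode_bigtime(uint64_t ts, uint32_t *nsec)
    {
        *nsec = ts % NSEC_PER_SEC;
        return (int64_t)(ts / NSEC_PER_SEC) - BIGTIME_EPOCH_OFFSET;
    }

    int main(void)
    {
        uint32_t n;

        /* the classic minimum, Dec 13 1901, encodes to ondisk zero... */
        printf("%llu\n", (unsigned long long)encode_bigtime(-2147483648LL, 0));
        /* ...and ondisk zero decodes back to the same Unix timestamp */
        printf("%lld\n", (long long)decode_bigtime(0, &n));
        return 0;
    }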
+ * This corresponds exactly with the incore expiration Jan 1 00:00:01 UTC 1970. + */ +#define XFS_DQ_LEGACY_EXPIRY_MIN ((int64_t)1) + +/* + * Largest possible ondisk quota expiration value with traditional timestamps. + * This corresponds exactly with the incore expiration Feb 7 06:28:15 UTC 2106. + */ +#define XFS_DQ_LEGACY_EXPIRY_MAX ((int64_t)U32_MAX) + +/* + * Smallest possible ondisk quota expiration value with bigtime timestamps. + * This corresponds (after conversion to a Unix timestamp) with the incore + * expiration of Jan 1 00:00:04 UTC 1970. + */ +#define XFS_DQ_BIGTIME_EXPIRY_MIN (XFS_DQ_LEGACY_EXPIRY_MIN) + +/* + * Largest supported ondisk quota expiration value with bigtime timestamps. + * This corresponds (after conversion to a Unix timestamp) with an incore + * expiration of Jul 2 20:20:24 UTC 2486. + * + * The ondisk field supports values up to -1U, which corresponds to an incore + * expiration in 2514. This is beyond the maximum bigtime inode timestamp, + * so we cap the maximum bigtime quota expiration to the max inode timestamp. + */ +#define XFS_DQ_BIGTIME_EXPIRY_MAX ((int64_t)4074815106U) + +/* + * The following conversion factors assist in converting a quota expiration + * timestamp between the incore and ondisk formats. + */ +#define XFS_DQ_BIGTIME_SHIFT (2) +#define XFS_DQ_BIGTIME_SLACK ((int64_t)(1ULL << XFS_DQ_BIGTIME_SHIFT) - 1) + +/* Convert an incore quota expiration timestamp to an ondisk bigtime value. */ +static inline uint32_t xfs_dq_unix_to_bigtime(time64_t unix_seconds) +{ + /* + * Round the expiration timestamp up to the nearest bigtime timestamp + * that we can store, to give users the most time to fix problems. + */ + return ((uint64_t)unix_seconds + XFS_DQ_BIGTIME_SLACK) >> + XFS_DQ_BIGTIME_SHIFT; +} + +/* Convert an ondisk bigtime quota expiration value to an incore timestamp. */ +static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds) +{ + return (time64_t)ondisk_seconds << XFS_DQ_BIGTIME_SHIFT; +} + +/* + * Default quota grace periods, ranging from zero (use the compiled defaults) + * to ~136 years. These are applied to a non-root dquot that has exceeded + * either limit. + */ +#define XFS_DQ_GRACE_MIN ((int64_t)0) +#define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX) /* * This is the main portion of the on-disk representation of quota information diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 84bcffa87753..2a2e3cfd94f0 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -249,6 +249,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_SPINODES (1 << 18) /* sparse inode chunks */ #define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */ #define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */ +#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */ /* * Minimum and maximum sizes need for growth checks. diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index a6b37db55169..974e71bc4a3a 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2473,6 +2473,7 @@ xfs_ialloc_log_agi( offsetof(xfs_agi_t, agi_unlinked), offsetof(xfs_agi_t, agi_free_root), offsetof(xfs_agi_t, agi_free_level), + offsetof(xfs_agi_t, agi_iblocks), sizeof(xfs_agi_t) }; #ifdef DEBUG @@ -2806,6 +2807,10 @@ xfs_ialloc_setup_geometry( uint64_t icount; uint inodes; + igeo->new_diflags2 = 0; + if (xfs_sb_version_hasbigtime(&mp->m_sb)) + igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME; + /* Compute inode btree geometry.
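The two-bit shift gives ondisk quota expirations four-second granularity, and rounding up by the slack ensures a round trip never shortens a grace period. A hedged standalone sketch of the pair (names are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    #define DQ_BIGTIME_SHIFT 2
    #define DQ_BIGTIME_SLACK ((int64_t)(1ULL << DQ_BIGTIME_SHIFT) - 1)

    /* round up to the next storable value so users never lose grace time */
    static uint32_t dq_to_ondisk(int64_t unix_seconds)
    {
        return (uint64_t)(unix_seconds + DQ_BIGTIME_SLACK) >> DQ_BIGTIME_SHIFT;
    }

    static int64_t dq_to_incore(uint32_t ondisk_seconds)
    {
        return (int64_t)ondisk_seconds << DQ_BIGTIME_SHIFT;
    }

    int main(void)
    {
        /* 5 is not a multiple of 4, so the round trip rounds it up to 8 */
        printf("%lld\n", (long long)dq_to_incore(dq_to_ondisk(5)));
        return 0;
    }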
*/ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog; igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 3c8aebc36e64..cc919a2ee870 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -67,6 +67,25 @@ xfs_finobt_set_root( XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL); } +/* Update the inode btree block counter for this btree. */ +static inline void +xfs_inobt_mod_blockcount( + struct xfs_btree_cur *cur, + int howmuch) +{ + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agi *agi = agbp->b_addr; + + if (!xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) + return; + + if (cur->bc_btnum == XFS_BTNUM_FINO) + be32_add_cpu(&agi->agi_fblocks, howmuch); + else if (cur->bc_btnum == XFS_BTNUM_INO) + be32_add_cpu(&agi->agi_iblocks, howmuch); + xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS); +} + STATIC int __xfs_inobt_alloc_block( struct xfs_btree_cur *cur, @@ -102,6 +121,7 @@ __xfs_inobt_alloc_block( new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno)); *stat = 1; + xfs_inobt_mod_blockcount(cur, 1); return 0; } @@ -134,6 +154,7 @@ __xfs_inobt_free_block( struct xfs_buf *bp, enum xfs_ag_resv_type resv) { + xfs_inobt_mod_blockcount(cur, -1); return xfs_free_extent(cur->bc_tp, XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, &XFS_RMAP_OINFO_INOBT, resv); @@ -480,19 +501,29 @@ xfs_inobt_commit_staged_btree( { struct xfs_agi *agi = agbp->b_addr; struct xbtree_afakeroot *afake = cur->bc_ag.afake; + int fields; ASSERT(cur->bc_flags & XFS_BTREE_STAGING); if (cur->bc_btnum == XFS_BTNUM_INO) { + fields = XFS_AGI_ROOT | XFS_AGI_LEVEL; agi->agi_root = cpu_to_be32(afake->af_root); agi->agi_level = cpu_to_be32(afake->af_levels); - xfs_ialloc_log_agi(tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); + if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) { + agi->agi_iblocks = cpu_to_be32(afake->af_blocks); + fields |= XFS_AGI_IBLOCKS; + } + xfs_ialloc_log_agi(tp, agbp, fields); xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops); } else { + fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL; agi->agi_free_root = cpu_to_be32(afake->af_root); agi->agi_free_level = cpu_to_be32(afake->af_levels); - xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREE_ROOT | - XFS_AGI_FREE_LEVEL); + if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) { + agi->agi_fblocks = cpu_to_be32(afake->af_blocks); + fields |= XFS_AGI_IBLOCKS; + } + xfs_ialloc_log_agi(tp, agbp, fields); xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops); } } @@ -673,6 +704,28 @@ xfs_inobt_count_blocks( return error; } +/* Read finobt block count from AGI header. */ +static int +xfs_finobt_read_blocks( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_extlen_t *tree_blocks) +{ + struct xfs_buf *agbp; + struct xfs_agi *agi; + int error; + + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) + return error; + + agi = agbp->b_addr; + *tree_blocks = be32_to_cpu(agi->agi_fblocks); + xfs_trans_brelse(tp, agbp); + return 0; +} + /* * Figure out how many blocks to reserve and how many are used by this btree. 
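xfs_inobt_mod_blockcount() above adjusts the counters while they stay big-endian in the buffer, which is what be32_add_cpu() does. A toy equivalent, using htonl/ntohl as stand-ins for the kernel byte-order helpers (be32_add is an illustrative name):

    #include <stdint.h>
    #include <stdio.h>
    #include <arpa/inet.h>

    /* adjust a big-endian on-disk counter in place, like be32_add_cpu() */
    static void be32_add(uint32_t *be_counter, int32_t delta)
    {
        *be_counter = htonl(ntohl(*be_counter) + delta);
    }

    int main(void)
    {
        uint32_t agi_iblocks = htonl(1);   /* freshly-made AG: one inobt block */

        be32_add(&agi_iblocks, 1);         /* btree grew by one block */
        be32_add(&agi_iblocks, -1);        /* ...and shrank again */
        printf("%u\n", ntohl(agi_iblocks)); /* prints 1 */
        return 0;
    }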
*/ @@ -690,7 +743,11 @@ xfs_finobt_calc_reserves( if (!xfs_sb_version_hasfinobt(&mp->m_sb)) return 0; - error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO, &tree_len); + if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) + error = xfs_finobt_read_blocks(mp, tp, agno, &tree_len); + else + error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO, + &tree_len); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 52451809c478..b4164256993d 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -603,7 +603,7 @@ xfs_iext_realloc_root( if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF) new_size = NODE_SIZE; - new = kmem_realloc(ifp->if_u1.if_root, new_size, KM_NOFS); + new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL); memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes); ifp->if_u1.if_root = new; cur->leaf = new; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 8d5dd08eab75..c667c63f2cb0 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -157,6 +157,36 @@ xfs_imap_to_bp( return 0; } +static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts) +{ + struct timespec64 tv; + uint32_t n; + + tv.tv_sec = xfs_bigtime_to_unix(div_u64_rem(ts, NSEC_PER_SEC, &n)); + tv.tv_nsec = n; + + return tv; +} + +/* Convert an ondisk timestamp to an incore timestamp. */ +struct timespec64 +xfs_inode_from_disk_ts( + struct xfs_dinode *dip, + const xfs_timestamp_t ts) +{ + struct timespec64 tv; + struct xfs_legacy_timestamp *lts; + + if (xfs_dinode_has_bigtime(dip)) + return xfs_inode_decode_bigtime(be64_to_cpu(ts)); + + lts = (struct xfs_legacy_timestamp *)&ts; + tv.tv_sec = (int)be32_to_cpu(lts->t_sec); + tv.tv_nsec = (int)be32_to_cpu(lts->t_nsec); + + return tv; +} + int xfs_inode_from_disk( struct xfs_inode *ip, @@ -211,12 +241,9 @@ xfs_inode_from_disk( * a time before epoch is converted to a time long after epoch * on 64 bit systems. */ - inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec); - inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec); - inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec); - inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec); - inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec); - inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec); + inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime); + inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime); + inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime); to->di_size = be64_to_cpu(from->di_size); to->di_nblocks = be64_to_cpu(from->di_nblocks); @@ -229,8 +256,7 @@ xfs_inode_from_disk( if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { inode_set_iversion_queried(inode, be64_to_cpu(from->di_changecount)); - to->di_crtime.tv_sec = be32_to_cpu(from->di_crtime.t_sec); - to->di_crtime.tv_nsec = be32_to_cpu(from->di_crtime.t_nsec); + to->di_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); to->di_flags2 = be64_to_cpu(from->di_flags2); to->di_cowextsize = be32_to_cpu(from->di_cowextsize); } @@ -252,6 +278,25 @@ out_destroy_data_fork: return error; } +/* Convert an incore timestamp to an ondisk timestamp. 
*/ +static inline xfs_timestamp_t +xfs_inode_to_disk_ts( + struct xfs_inode *ip, + const struct timespec64 tv) +{ + struct xfs_legacy_timestamp *lts; + xfs_timestamp_t ts; + + if (xfs_inode_has_bigtime(ip)) + return cpu_to_be64(xfs_inode_encode_bigtime(tv)); + + lts = (struct xfs_legacy_timestamp *)&ts; + lts->t_sec = cpu_to_be32(tv.tv_sec); + lts->t_nsec = cpu_to_be32(tv.tv_nsec); + + return ts; +} + void xfs_inode_to_disk( struct xfs_inode *ip, @@ -271,12 +316,9 @@ xfs_inode_to_disk( to->di_projid_hi = cpu_to_be16(from->di_projid >> 16); memset(to->di_pad, 0, sizeof(to->di_pad)); - to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec); - to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec); - to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec); - to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec); - to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec); - to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec); + to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); + to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime); + to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime); to->di_nlink = cpu_to_be32(inode->i_nlink); to->di_gen = cpu_to_be32(inode->i_generation); to->di_mode = cpu_to_be16(inode->i_mode); @@ -295,8 +337,7 @@ xfs_inode_to_disk( if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { to->di_version = 3; to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); - to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.tv_sec); - to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.tv_nsec); + to->di_crtime = xfs_inode_to_disk_ts(ip, from->di_crtime); to->di_flags2 = cpu_to_be64(from->di_flags2); to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); @@ -310,58 +351,6 @@ xfs_inode_to_disk( } } -void -xfs_log_dinode_to_disk( - struct xfs_log_dinode *from, - struct xfs_dinode *to) -{ - to->di_magic = cpu_to_be16(from->di_magic); - to->di_mode = cpu_to_be16(from->di_mode); - to->di_version = from->di_version; - to->di_format = from->di_format; - to->di_onlink = 0; - to->di_uid = cpu_to_be32(from->di_uid); - to->di_gid = cpu_to_be32(from->di_gid); - to->di_nlink = cpu_to_be32(from->di_nlink); - to->di_projid_lo = cpu_to_be16(from->di_projid_lo); - to->di_projid_hi = cpu_to_be16(from->di_projid_hi); - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); - - to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); - to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); - to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); - to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); - to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); - to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); - - to->di_size = cpu_to_be64(from->di_size); - to->di_nblocks = cpu_to_be64(from->di_nblocks); - to->di_extsize = cpu_to_be32(from->di_extsize); - to->di_nextents = cpu_to_be32(from->di_nextents); - to->di_anextents = cpu_to_be16(from->di_anextents); - to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; - to->di_dmevmask = cpu_to_be32(from->di_dmevmask); - to->di_dmstate = cpu_to_be16(from->di_dmstate); - to->di_flags = cpu_to_be16(from->di_flags); - to->di_gen = cpu_to_be32(from->di_gen); - - if (from->di_version == 3) { - to->di_changecount = cpu_to_be64(from->di_changecount); - to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); - to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); - to->di_flags2 = cpu_to_be64(from->di_flags2); - to->di_cowextsize = 
cpu_to_be32(from->di_cowextsize); - to->di_ino = cpu_to_be64(from->di_ino); - to->di_lsn = cpu_to_be64(from->di_lsn); - memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); - uuid_copy(&to->di_uuid, &from->di_uuid); - to->di_flushiter = 0; - } else { - to->di_flushiter = cpu_to_be16(from->di_flushiter); - } -} - static xfs_failaddr_t xfs_dinode_verify_fork( struct xfs_dinode *dip, @@ -568,6 +557,11 @@ xfs_dinode_verify( if (fa) return fa; + /* bigtime iflag can only happen on bigtime filesystems */ + if (xfs_dinode_has_bigtime(dip) && + !xfs_sb_version_hasbigtime(&mp->m_sb)) + return __this_address; + return NULL; } diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 6b08b9d060c2..536666143fe7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -32,6 +32,11 @@ struct xfs_icdinode { struct timespec64 di_crtime; /* time created */ }; +static inline bool xfs_icdinode_has_bigtime(const struct xfs_icdinode *icd) +{ + return icd->di_flags2 & XFS_DIFLAG2_BIGTIME; +} + /* * Inode location information. Stored in the inode and passed to * xfs_imap_to_bp() to get a buffer and dinode for a given inode. @@ -49,8 +54,6 @@ void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to, xfs_lsn_t lsn); int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); -void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, - struct xfs_dinode *to); xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_dinode *dip); @@ -60,4 +63,12 @@ xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp, uint32_t cowextsize, uint16_t mode, uint16_t flags, uint64_t flags2); +static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv) +{ + return xfs_unix_to_bigtime(tv.tv_sec) * NSEC_PER_SEC + tv.tv_nsec; +} + +struct timespec64 xfs_inode_from_disk_ts(struct xfs_dinode *dip, + const xfs_timestamp_t ts); + #endif /* __XFS_INODE_BUF_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 0cf853d42d62..7575de5cecb1 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -386,8 +386,8 @@ xfs_iroot_realloc( cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); new_max = cur_max + rec_diff; new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); - ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - KM_NOFS); + ifp->if_broot = krealloc(ifp->if_broot, new_size, + GFP_NOFS | __GFP_NOFAIL); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, @@ -496,8 +496,8 @@ xfs_idata_realloc( * in size so that it can be logged and stay on word boundaries. * We enforce that here. */ - ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data, - roundup(new_size, 4), KM_NOFS); + ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4), + GFP_NOFS | __GFP_NOFAIL); ifp->if_bytes = new_size; } diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index e3400c9c71cd..8bd00da6d2a4 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -368,10 +368,13 @@ static inline int xfs_ilog_fdata(int w) * directly mirrors the xfs_dinode structure as it must contain all the same * information. */ -typedef struct xfs_ictimestamp { +typedef uint64_t xfs_ictimestamp_t; + +/* Legacy timestamp encoding format. 
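The kmem_realloc() removal earlier in this series moves callers to plain krealloc() with __GFP_NOFAIL, and the iext-root caller still zeroes the newly grown tail itself. A userspace sketch of that grow-and-zero pattern; grow_zeroed is an illustrative name, and realloc can fail where __GFP_NOFAIL cannot:

    #include <stdlib.h>
    #include <string.h>

    /* grow a buffer and zero only the newly added bytes */
    static void *grow_zeroed(void *p, size_t old_size, size_t new_size)
    {
        void *np = realloc(p, new_size);

        if (!np)        /* the kernel callers instead pass __GFP_NOFAIL */
            return NULL;
        memset((char *)np + old_size, 0, new_size - old_size);
        return np;
    }

    int main(void)
    {
        char *p = grow_zeroed(NULL, 0, 8);   /* realloc(NULL, n) is malloc(n) */

        p = grow_zeroed(p, 8, 16);           /* bytes 8..15 are now zeroed */
        free(p);
        return 0;
    }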
*/ +struct xfs_legacy_ictimestamp { int32_t t_sec; /* timestamp seconds */ int32_t t_nsec; /* timestamp nanoseconds */ -} xfs_ictimestamp_t; +}; /* * Define the format of the inode core that is logged. This structure must be diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 641132d0e39d..3cca2bfe714c 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -121,7 +121,6 @@ struct xlog_recover { void xlog_buf_readahead(struct xlog *log, xfs_daddr_t blkno, uint len, const struct xfs_buf_ops *ops); bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len); -void xlog_recover_iodone(struct xfs_buf *bp); void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 076bdc7037ee..0f0af4e35032 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -23,7 +23,8 @@ typedef uint8_t xfs_dqtype_t; #define XFS_DQTYPE_STRINGS \ { XFS_DQTYPE_USER, "USER" }, \ { XFS_DQTYPE_PROJ, "PROJ" }, \ - { XFS_DQTYPE_GROUP, "GROUP" } + { XFS_DQTYPE_GROUP, "GROUP" }, \ + { XFS_DQTYPE_BIGTIME, "BIGTIME" } /* * flags for q_flags field in the dquot. @@ -143,4 +144,9 @@ extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb, xfs_dqid_t id, xfs_dqtype_t type); +struct xfs_dquot; +time64_t xfs_dquot_from_disk_ts(struct xfs_disk_dquot *ddq, + __be32 dtimer); +__be32 xfs_dquot_to_disk_ts(struct xfs_dquot *ddq, time64_t timer); + #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index ae9aaf1f34bf..5aeafa59ed27 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -954,7 +954,7 @@ xfs_log_sb( struct xfs_trans *tp) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_buf *bp = xfs_trans_getsb(tp, mp); + struct xfs_buf *bp = xfs_trans_getsb(tp); mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); @@ -1084,7 +1084,7 @@ xfs_sync_sb_buf( if (error) return error; - bp = xfs_trans_getsb(tp, mp); + bp = xfs_trans_getsb(tp); xfs_log_sb(tp); xfs_trans_bhold(tp, bp); xfs_trans_set_sync(tp); @@ -1166,6 +1166,8 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT; if (xfs_sb_version_hasreflink(sbp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK; + if (xfs_sb_version_hasbigtime(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME; if (xfs_sb_version_hassector(sbp)) geo->logsectsize = sbp->sb_logsectsize; else diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 708feb8eac76..c795ae47b3c9 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -176,6 +176,9 @@ struct xfs_ino_geometry { unsigned int ialloc_align; unsigned int agino_log; /* #bits for agino in inum */ + + /* precomputed value for di_flags2 */ + uint64_t new_diflags2; }; #endif /* __XFS_SHARED_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index b7e222befb08..90f1d5645052 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -132,6 +132,17 @@ xfs_trans_log_inode( } /* + * If we're updating the inode core or the timestamps and it's possible + * to upgrade this inode to bigtime format, do so now. 
+ */ + if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) && + xfs_sb_version_hasbigtime(&ip->i_mount->m_sb) && + !xfs_inode_has_bigtime(ip)) { + ip->i_d.di_flags2 |= XFS_DIFLAG2_BIGTIME; + flags |= XFS_ILOG_CORE; + } + + /* * Record the specific change for fdatasync optimisation. This allows * fdatasync to skip log forces for inodes that are only timestamp * dirty. @@ -177,9 +188,9 @@ xfs_trans_log_inode( /* * Always OR in the bits from the ili_last_fields field. This is to - * coordinate with the xfs_iflush() and xfs_iflush_done() routines in - * the eventual clearing of the ili_fields bits. See the big comment in - * xfs_iflush() for an explanation of this coordination mechanism. + * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines + * in the eventual clearing of the ili_fields bits. See the big comment + * in xfs_iflush() for an explanation of this coordination mechanism. */ iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags); spin_unlock(&iip->ili_lock); diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index e9bcf1faa183..ae8e2e0ac64a 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -781,6 +781,35 @@ xchk_agi_xref_icounts( xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp); } +/* Check agi_[fi]blocks against tree size */ +static inline void +xchk_agi_xref_fiblocks( + struct xfs_scrub *sc) +{ + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + xfs_agblock_t blocks; + int error = 0; + + if (!xfs_sb_version_hasinobtcounts(&sc->mp->m_sb)) + return; + + if (sc->sa.ino_cur) { + error = xfs_btree_count_blocks(sc->sa.ino_cur, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.ino_cur)) + return; + if (blocks != be32_to_cpu(agi->agi_iblocks)) + xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp); + } + + if (sc->sa.fino_cur) { + error = xfs_btree_count_blocks(sc->sa.fino_cur, &blocks); + if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur)) + return; + if (blocks != be32_to_cpu(agi->agi_fblocks)) + xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp); + } +} + /* Cross-reference with the other btrees. 
*/ STATIC void xchk_agi_xref( @@ -804,6 +833,7 @@ xchk_agi_xref( xchk_agi_xref_icounts(sc); xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_agi_xref_fiblocks(sc); /* scrub teardown will take care of sc->sa for us */ } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index bca2ab1d4be9..401f71579ce6 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -810,10 +810,34 @@ xrep_agi_calc_from_btrees( error = xfs_ialloc_count_inodes(cur, &count, &freecount); if (error) goto err; + if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) { + xfs_agblock_t blocks; + + error = xfs_btree_count_blocks(cur, &blocks); + if (error) + goto err; + agi->agi_iblocks = cpu_to_be32(blocks); + } xfs_btree_del_cursor(cur, error); agi->agi_count = cpu_to_be32(count); agi->agi_freecount = cpu_to_be32(freecount); + + if (xfs_sb_version_hasfinobt(&mp->m_sb) && + xfs_sb_version_hasinobtcounts(&mp->m_sb)) { + xfs_agblock_t blocks; + + cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.agno, + XFS_BTNUM_FINO); + if (error) + goto err; + error = xfs_btree_count_blocks(cur, &blocks); + if (error) + goto err; + xfs_btree_del_cursor(cur, error); + agi->agi_fblocks = cpu_to_be32(blocks); + } + return 0; err: xfs_btree_del_cursor(cur, error); diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 6d483ab29e63..3aa85b64de36 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -190,11 +190,30 @@ xchk_inode_flags2( if ((flags2 & XFS_DIFLAG2_DAX) && (flags2 & XFS_DIFLAG2_REFLINK)) goto bad; + /* no bigtime iflag without the bigtime feature */ + if (xfs_dinode_has_bigtime(dip) && + !xfs_sb_version_hasbigtime(&mp->m_sb)) + goto bad; + return; bad: xchk_ino_set_corrupt(sc, ino); } +static inline void +xchk_dinode_nsec( + struct xfs_scrub *sc, + xfs_ino_t ino, + struct xfs_dinode *dip, + const xfs_timestamp_t ts) +{ + struct timespec64 tv; + + tv = xfs_inode_from_disk_ts(dip, ts); + if (tv.tv_nsec < 0 || tv.tv_nsec >= NSEC_PER_SEC) + xchk_ino_set_corrupt(sc, ino); +} + /* Scrub all the ondisk inode fields. */ STATIC void xchk_dinode( @@ -293,12 +312,9 @@ xchk_dinode( } /* di_[amc]time.nsec */ - if (be32_to_cpu(dip->di_atime.t_nsec) >= NSEC_PER_SEC) - xchk_ino_set_corrupt(sc, ino); - if (be32_to_cpu(dip->di_mtime.t_nsec) >= NSEC_PER_SEC) - xchk_ino_set_corrupt(sc, ino); - if (be32_to_cpu(dip->di_ctime.t_nsec) >= NSEC_PER_SEC) - xchk_ino_set_corrupt(sc, ino); + xchk_dinode_nsec(sc, ino, dip, dip->di_atime); + xchk_dinode_nsec(sc, ino, dip, dip->di_mtime); + xchk_dinode_nsec(sc, ino, dip, dip->di_ctime); /* * di_size. xfs_dinode_verify checks for things that screw up @@ -403,8 +419,7 @@ xchk_dinode( } if (dip->di_version >= 3) { - if (be32_to_cpu(dip->di_crtime.t_nsec) >= NSEC_PER_SEC) - xchk_ino_set_corrupt(sc, ino); + xchk_dinode_nsec(sc, ino, dip, dip->di_crtime); xchk_inode_flags2(sc, dip, ino, mode, flags, flags2); xchk_inode_cowextsize(sc, dip, ino, mode, flags, flags2); diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 5641ae512c9e..c08be5ede066 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -22,7 +22,7 @@ xchk_setup_symlink( struct xfs_inode *ip) { /* Allocate the buffer without the inode lock held. 
*/ - sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0); + sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL); if (!sc->buf) return -ENOMEM; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index d4c687b5cd06..c544951a0c07 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -192,7 +192,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (acl) { args.valuelen = XFS_ACL_SIZE(acl->a_count); - args.value = kmem_zalloc_large(args.valuelen, 0); + args.value = kvzalloc(args.valuelen, GFP_KERNEL); if (!args.value) return -ENOMEM; xfs_acl_to_disk(args.value, acl); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index b35611882ff9..55d126d4e096 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -544,7 +544,7 @@ xfs_discard_page( page, ip->i_ino, offset); error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - PAGE_SIZE / i_blocksize(inode)); + i_blocks_per_page(inode, page)); if (error && !XFS_FORCED_SHUTDOWN(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); out_invalidate: diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 50f922cad91a..8f8837fe21cf 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -61,7 +61,7 @@ xfs_attr_shortform_list( int error = 0; ASSERT(dp->i_afp != NULL); - sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; ASSERT(sf != NULL); if (!sf->hdr.count) return 0; @@ -96,7 +96,7 @@ xfs_attr_shortform_list( */ if (context->seen_enough) break; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sfe = xfs_attr_sf_nextentry(sfe); } trace_xfs_attr_list_sf_all(context); return 0; @@ -136,7 +136,7 @@ xfs_attr_shortform_list( /* These are bytes, and both on-disk, don't endian-flip */ sbp->valuelen = sfe->valuelen; sbp->flags = sfe->flags; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sfe = xfs_attr_sf_nextentry(sfe); sbp++; nsbuf++; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 5123f82f2477..f2a8a0e75e1f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -946,6 +946,14 @@ xfs_free_file_space( startoffset_fsb = XFS_B_TO_FSB(mp, offset); endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); + /* We can only free complete realtime extents. */ + if (XFS_IS_REALTIME_INODE(ip)) { + xfs_extlen_t extsz = xfs_get_extsz_hint(ip); + + if ((startoffset_fsb | endoffset_fsb) & (extsz - 1)) + return -EINVAL; + } + /* * Need to zero the stuff we're not freeing, on disk. */ @@ -1139,6 +1147,14 @@ xfs_insert_file_space( trace_xfs_insert_file_space(ip); + /* We can only insert complete realtime extents. 
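The realtime alignment checks added to xfs_free_file_space() and xfs_insert_file_space() fold two tests into one: OR-ing both block offsets and masking with extsz - 1 is nonzero iff either offset is misaligned, a shortcut that presumes the extent size is a power of two (a plain modulo would be needed otherwise). A minimal sketch:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* both values are multiples of extsz iff their OR has no low bits set;
     * this shortcut assumes extsz is a power of two */
    static bool both_aligned(uint64_t a, uint64_t b, uint64_t extsz)
    {
        return ((a | b) & (extsz - 1)) == 0;
    }

    int main(void)
    {
        printf("%d\n", both_aligned(8, 24, 8));  /* 1: both are multiples of 8 */
        printf("%d\n", both_aligned(8, 20, 8));  /* 0: 20 is misaligned */
        return 0;
    }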
*/ + if (XFS_IS_REALTIME_INODE(ip)) { + xfs_extlen_t extsz = xfs_get_extsz_hint(ip); + + if ((stop_fsb | shift_fsb) & (extsz - 1)) + return -EINVAL; + } + error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb); if (error) return error; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d4cdcb6fb2fe..4e4cf91f4f9f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -52,6 +52,15 @@ static kmem_zone_t *xfs_buf_zone; * b_lock (trylock due to inversion) */ +static int __xfs_buf_submit(struct xfs_buf *bp, bool wait); + +static inline int +xfs_buf_submit( + struct xfs_buf *bp) +{ + return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC)); +} + static inline int xfs_buf_is_vmapped( struct xfs_buf *bp) @@ -751,7 +760,7 @@ found: return 0; } -STATIC int +int _xfs_buf_read( xfs_buf_t *bp, xfs_buf_flags_t flags) @@ -759,7 +768,7 @@ _xfs_buf_read( ASSERT(!(flags & XBF_WRITE)); ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); - bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); + bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); return xfs_buf_submit(bp); @@ -1170,20 +1179,145 @@ xfs_buf_wait_unpin( set_current_state(TASK_RUNNING); } +static void +xfs_buf_ioerror_alert_ratelimited( + struct xfs_buf *bp) +{ + static unsigned long lasttime; + static struct xfs_buftarg *lasttarg; + + if (bp->b_target != lasttarg || + time_after(jiffies, (lasttime + 5*HZ))) { + lasttime = jiffies; + xfs_buf_ioerror_alert(bp, __this_address); + } + lasttarg = bp->b_target; +} + /* - * Buffer Utility Routines + * Account for this latest trip around the retry handler, and decide if + * we've failed enough times to constitute a permanent failure. */ +static bool +xfs_buf_ioerror_permanent( + struct xfs_buf *bp, + struct xfs_error_cfg *cfg) +{ + struct xfs_mount *mp = bp->b_mount; -void + if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && + ++bp->b_retries > cfg->max_retries) + return true; + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && + time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) + return true; + + /* At unmount we may treat errors differently */ + if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) + return true; + + return false; +} + +/* + * On a sync write or shutdown we just want to stale the buffer and let the + * caller handle the error in bp->b_error appropriately. + * + * If the write was asynchronous then no one will be looking for the error. If + * this is the first failure of this type, clear the error state and write the + * buffer out again. This means we always retry an async write failure at least + * once, but we also need to set the buffer up to behave correctly now for + * repeated failures. + * + * If we get repeated async write failures, then we take action according to the + * error configuration we have been set up to use. + * + * Returns true if this function took care of error handling and the caller must + * not touch the buffer again. Return false if the caller should proceed with + * normal I/O completion handling. + */ +static bool +xfs_buf_ioend_handle_error( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_error_cfg *cfg; + + /* + * If we've already decided to shutdown the filesystem because of I/O + * errors, there's no point in giving this a retry. + */ + if (XFS_FORCED_SHUTDOWN(mp)) + goto out_stale; + + xfs_buf_ioerror_alert_ratelimited(bp); + + /* + * We're not going to bother about retrying this during recovery. 
+ * One strike! + */ + if (bp->b_flags & _XBF_LOGRECOVERY) { + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + return false; + } + + /* + * Synchronous writes will have callers process the error. + */ + if (!(bp->b_flags & XBF_ASYNC)) + goto out_stale; + + trace_xfs_buf_iodone_async(bp, _RET_IP_); + + cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); + if (bp->b_last_error != bp->b_error || + !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) { + bp->b_last_error = bp->b_error; + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && + !bp->b_first_retry_time) + bp->b_first_retry_time = jiffies; + goto resubmit; + } + + /* + * Permanent error - we need to trigger a shutdown if we haven't already + * to indicate that inconsistency will result from this action. + */ + if (xfs_buf_ioerror_permanent(bp, cfg)) { + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + goto out_stale; + } + + /* Still considered a transient error. Caller will schedule retries. */ + if (bp->b_flags & _XBF_INODES) + xfs_buf_inode_io_fail(bp); + else if (bp->b_flags & _XBF_DQUOTS) + xfs_buf_dquot_io_fail(bp); + else + ASSERT(list_empty(&bp->b_li_list)); + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return true; + +resubmit: + xfs_buf_ioerror(bp, 0); + bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL); + xfs_buf_submit(bp); + return true; +out_stale: + xfs_buf_stale(bp); + bp->b_flags |= XBF_DONE; + bp->b_flags &= ~XBF_WRITE; + trace_xfs_buf_error_relse(bp, _RET_IP_); + return false; +} + +static void xfs_buf_ioend( struct xfs_buf *bp) { - bool read = bp->b_flags & XBF_READ; - trace_xfs_buf_iodone(bp, _RET_IP_); - bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); - /* * Pull in IO completion errors now. We are guaranteed to be running * single threaded, so we don't need the lock to read b_io_error. @@ -1191,39 +1325,47 @@ xfs_buf_ioend( if (!bp->b_error && bp->b_io_error) xfs_buf_ioerror(bp, bp->b_io_error); - if (read) { + if (bp->b_flags & XBF_READ) { if (!bp->b_error && bp->b_ops) bp->b_ops->verify_read(bp); if (!bp->b_error) bp->b_flags |= XBF_DONE; - xfs_buf_ioend_finish(bp); - return; - } + } else { + if (!bp->b_error) { + bp->b_flags &= ~XBF_WRITE_FAIL; + bp->b_flags |= XBF_DONE; + } - if (!bp->b_error) { - bp->b_flags &= ~XBF_WRITE_FAIL; - bp->b_flags |= XBF_DONE; - } + if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp)) + return; - /* - * If this is a log recovery buffer, we aren't doing transactional IO - * yet so we need to let it handle IO completions. - */ - if (bp->b_flags & _XBF_LOGRECOVERY) { - xlog_recover_iodone(bp); - return; - } + /* clear the retry state */ + bp->b_last_error = 0; + bp->b_retries = 0; + bp->b_first_retry_time = 0; - if (bp->b_flags & _XBF_INODES) { - xfs_buf_inode_iodone(bp); - return; - } + /* + * Note that for things like remote attribute buffers, there may + * not be a buffer log item here, so processing the buffer log + * item must remain optional. + */ + if (bp->b_log_item) + xfs_buf_item_done(bp); + + if (bp->b_flags & _XBF_INODES) + xfs_buf_inode_iodone(bp); + else if (bp->b_flags & _XBF_DQUOTS) + xfs_buf_dquot_iodone(bp); - if (bp->b_flags & _XBF_DQUOTS) { - xfs_buf_dquot_iodone(bp); - return; } - xfs_buf_iodone(bp); + + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD | + _XBF_LOGRECOVERY); + + if (bp->b_flags & XBF_ASYNC) + xfs_buf_relse(bp); + else + complete(&bp->b_iowait); } static void @@ -1506,7 +1648,7 @@ xfs_buf_iowait( * safe to reference the buffer after a call to this function unless the caller * holds an additional reference itself. 
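With the multi-state XBF_IOERROR_* returns gone, the async write retry policy reduces to a resubmit path plus one permanence predicate. A standalone model of that predicate, with a plain tick counter standing in for jiffies and the unmount case simplified (retry_cfg and failure_is_permanent are illustrative names):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define RETRY_FOREVER ((uint32_t)-1)

    struct retry_cfg {
        uint32_t max_retries;   /* RETRY_FOREVER means unlimited */
        uint32_t retry_timeout; /* ticks; RETRY_FOREVER means no deadline */
    };

    /* model of the three permanent-failure conditions in the patch */
    static bool failure_is_permanent(const struct retry_cfg *cfg,
                                     uint32_t retries, uint64_t now,
                                     uint64_t first_failure, bool unmounting)
    {
        if (cfg->max_retries != RETRY_FOREVER && retries > cfg->max_retries)
            return true;
        if (cfg->retry_timeout != RETRY_FOREVER &&
            now > first_failure + cfg->retry_timeout)
            return true;
        return unmounting;  /* at unmount, fail fast instead of looping */
    }

    int main(void)
    {
        struct retry_cfg cfg = { .max_retries = 2,
                                 .retry_timeout = RETRY_FOREVER };

        printf("%d\n", failure_is_permanent(&cfg, 3, 100, 90, false)); /* 1 */
        printf("%d\n", failure_is_permanent(&cfg, 1, 100, 90, false)); /* 0 */
        return 0;
    }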
*/ -int +static int __xfs_buf_submit( struct xfs_buf *bp, bool wait) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 755b652e695a..bfd2907e7bc4 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -249,6 +249,7 @@ int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags, int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, int flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops); +int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ @@ -269,28 +270,12 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); -extern void xfs_buf_ioend(struct xfs_buf *bp); -static inline void xfs_buf_ioend_finish(struct xfs_buf *bp) -{ - if (bp->b_flags & XBF_ASYNC) - xfs_buf_relse(bp); - else - complete(&bp->b_iowait); -} extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa); void xfs_buf_ioend_fail(struct xfs_buf *); - -extern int __xfs_buf_submit(struct xfs_buf *bp, bool); -static inline int xfs_buf_submit(struct xfs_buf *bp) -{ - bool wait = bp->b_flags & XBF_ASYNC ? false : true; - return __xfs_buf_submit(bp, wait); -} - void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize); void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa); #define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 408d1b572d3f..0356f2e340a1 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -30,8 +30,6 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_buf_log_item, bli_item); } -static void xfs_buf_item_done(struct xfs_buf *bp); - /* Is this log iovec plausibly large enough to contain the buffer log format? */ bool xfs_buf_log_check_iovec( @@ -463,7 +461,7 @@ xfs_buf_item_unpin( */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { xfs_buf_item_done(bp); - xfs_iflush_done(bp); + xfs_buf_inode_iodone(bp); ASSERT(list_empty(&bp->b_li_list)); } else { xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); @@ -956,153 +954,10 @@ xfs_buf_item_relse( xfs_buf_item_free(bip); } -/* - * Decide if we're going to retry the write after a failure, and prepare - * the buffer for retrying the write. - */ -static bool -xfs_buf_ioerror_fail_without_retry( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_mount; - static ulong lasttime; - static xfs_buftarg_t *lasttarg; - - /* - * If we've already decided to shutdown the filesystem because of - * I/O errors, there's no point in giving this a retry. 
- */ - if (XFS_FORCED_SHUTDOWN(mp)) - return true; - - if (bp->b_target != lasttarg || - time_after(jiffies, (lasttime + 5*HZ))) { - lasttime = jiffies; - xfs_buf_ioerror_alert(bp, __this_address); - } - lasttarg = bp->b_target; - - /* synchronous writes will have callers process the error */ - if (!(bp->b_flags & XBF_ASYNC)) - return true; - return false; -} - -static bool -xfs_buf_ioerror_retry( - struct xfs_buf *bp, - struct xfs_error_cfg *cfg) -{ - if ((bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) && - bp->b_last_error == bp->b_error) - return false; - - bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); - bp->b_last_error = bp->b_error; - if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && - !bp->b_first_retry_time) - bp->b_first_retry_time = jiffies; - return true; -} - -/* - * Account for this latest trip around the retry handler, and decide if - * we've failed enough times to constitute a permanent failure. - */ -static bool -xfs_buf_ioerror_permanent( - struct xfs_buf *bp, - struct xfs_error_cfg *cfg) -{ - struct xfs_mount *mp = bp->b_mount; - - if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && - ++bp->b_retries > cfg->max_retries) - return true; - if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && - time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) - return true; - - /* At unmount we may treat errors differently */ - if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) - return true; - - return false; -} - -/* - * On a sync write or shutdown we just want to stale the buffer and let the - * caller handle the error in bp->b_error appropriately. - * - * If the write was asynchronous then no one will be looking for the error. If - * this is the first failure of this type, clear the error state and write the - * buffer out again. This means we always retry an async write failure at least - * once, but we also need to set the buffer up to behave correctly now for - * repeated failures. - * - * If we get repeated async write failures, then we take action according to the - * error configuration we have been set up to use. - * - * Multi-state return value: - * - * XBF_IOERROR_FINISH: clear IO error retry state and run callback completions - * XBF_IOERROR_DONE: resubmitted immediately, do not run any completions - * XBF_IOERROR_FAIL: transient error, run failure callback completions and then - * release the buffer - */ -enum { - XBF_IOERROR_FINISH, - XBF_IOERROR_DONE, - XBF_IOERROR_FAIL, -}; - -static int -xfs_buf_iodone_error( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_mount; - struct xfs_error_cfg *cfg; - - if (xfs_buf_ioerror_fail_without_retry(bp)) - goto out_stale; - - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - - cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); - if (xfs_buf_ioerror_retry(bp, cfg)) { - xfs_buf_ioerror(bp, 0); - xfs_buf_submit(bp); - return XBF_IOERROR_DONE; - } - - /* - * Permanent error - we need to trigger a shutdown if we haven't already - * to indicate that inconsistency will result from this action. - */ - if (xfs_buf_ioerror_permanent(bp, cfg)) { - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - goto out_stale; - } - - /* Still considered a transient error. Caller will schedule retries. 
*/ - return XBF_IOERROR_FAIL; - -out_stale: - xfs_buf_stale(bp); - bp->b_flags |= XBF_DONE; - trace_xfs_buf_error_relse(bp, _RET_IP_); - return XBF_IOERROR_FINISH; -} - -static void +void xfs_buf_item_done( struct xfs_buf *bp) { - struct xfs_buf_log_item *bip = bp->b_log_item; - - if (!bip) - return; - /* * If we are forcibly shutting down, this may well be off the AIL * already. That's because we simulate the log-committed callbacks to @@ -1111,113 +966,12 @@ xfs_buf_item_done( * xfs_trans_ail_delete() takes care of these. * * Either way, AIL is useless if we're forcing a shutdown. + * + * Note that log recovery writes might have buffer items that are not on + * the AIL even when the file system is not shut down. */ - xfs_trans_ail_delete(&bip->bli_item, SHUTDOWN_CORRUPT_INCORE); - bp->b_log_item = NULL; - xfs_buf_item_free(bip); - xfs_buf_rele(bp); -} - -static inline void -xfs_buf_clear_ioerror_retry_state( - struct xfs_buf *bp) -{ - bp->b_last_error = 0; - bp->b_retries = 0; - bp->b_first_retry_time = 0; -} - -/* - * Inode buffer iodone callback function. - */ -void -xfs_buf_inode_iodone( - struct xfs_buf *bp) -{ - if (bp->b_error) { - struct xfs_log_item *lip; - int ret = xfs_buf_iodone_error(bp); - - if (ret == XBF_IOERROR_FINISH) - goto finish_iodone; - if (ret == XBF_IOERROR_DONE) - return; - ASSERT(ret == XBF_IOERROR_FAIL); - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - set_bit(XFS_LI_FAILED, &lip->li_flags); - } - xfs_buf_ioerror(bp, 0); - xfs_buf_relse(bp); - return; - } - -finish_iodone: - xfs_buf_clear_ioerror_retry_state(bp); - xfs_buf_item_done(bp); - xfs_iflush_done(bp); - xfs_buf_ioend_finish(bp); -} - -/* - * Dquot buffer iodone callback function. - */ -void -xfs_buf_dquot_iodone( - struct xfs_buf *bp) -{ - if (bp->b_error) { - struct xfs_log_item *lip; - int ret = xfs_buf_iodone_error(bp); - - if (ret == XBF_IOERROR_FINISH) - goto finish_iodone; - if (ret == XBF_IOERROR_DONE) - return; - ASSERT(ret == XBF_IOERROR_FAIL); - spin_lock(&bp->b_mount->m_ail->ail_lock); - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - xfs_set_li_failed(lip, bp); - } - spin_unlock(&bp->b_mount->m_ail->ail_lock); - xfs_buf_ioerror(bp, 0); - xfs_buf_relse(bp); - return; - } - -finish_iodone: - xfs_buf_clear_ioerror_retry_state(bp); - /* a newly allocated dquot buffer might have a log item attached */ - xfs_buf_item_done(bp); - xfs_dquot_done(bp); - xfs_buf_ioend_finish(bp); -} - -/* - * Dirty buffer iodone callback function. - * - * Note that for things like remote attribute buffers, there may not be a buffer - * log item here, so processing the buffer log item must remain be optional. - */ -void -xfs_buf_iodone( - struct xfs_buf *bp) -{ - if (bp->b_error) { - int ret = xfs_buf_iodone_error(bp); - - if (ret == XBF_IOERROR_FINISH) - goto finish_iodone; - if (ret == XBF_IOERROR_DONE) - return; - ASSERT(ret == XBF_IOERROR_FAIL); - ASSERT(list_empty(&bp->b_li_list)); - xfs_buf_ioerror(bp, 0); - xfs_buf_relse(bp); - return; - } - -finish_iodone: - xfs_buf_clear_ioerror_retry_state(bp); - xfs_buf_item_done(bp); - xfs_buf_ioend_finish(bp); + xfs_trans_ail_delete(&bp->b_log_item->bli_item, + (bp->b_flags & _XBF_LOGRECOVERY) ? 
0 : + SHUTDOWN_CORRUPT_INCORE); + xfs_buf_item_relse(bp); } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 23507cbb4c41..50aa0f5ef959 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -50,12 +50,24 @@ struct xfs_buf_log_item { }; int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); +void xfs_buf_item_done(struct xfs_buf *bp); void xfs_buf_item_relse(struct xfs_buf *); bool xfs_buf_item_put(struct xfs_buf_log_item *); void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_inode_iodone(struct xfs_buf *); +void xfs_buf_inode_io_fail(struct xfs_buf *bp); +#ifdef CONFIG_XFS_QUOTA void xfs_buf_dquot_iodone(struct xfs_buf *); +void xfs_buf_dquot_io_fail(struct xfs_buf *bp); +#else +static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp) +{ +} +static inline void xfs_buf_dquot_io_fail(struct xfs_buf *bp) +{ +} +#endif /* CONFIG_XFS_QUOTA */ void xfs_buf_iodone(struct xfs_buf *); bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 8f0457d67d77..24c7a8d11e1a 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -414,7 +414,7 @@ xlog_recover_validate_buf_type( * * Write verifiers update the metadata LSN from log items attached to * the buffer. Therefore, initialize a bli purely to carry the LSN to - * the verifier. We'll clean it up in our ->iodone() callback. + * the verifier. */ if (bp->b_ops) { struct xfs_buf_log_item *bip; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index bcd73b9c2994..3072814e407d 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -98,12 +98,33 @@ xfs_qm_adjust_dqlimits( xfs_dquot_set_prealloc_limits(dq); } +/* Set the expiration time of a quota's grace period. */ +time64_t +xfs_dquot_set_timeout( + struct xfs_mount *mp, + time64_t timeout) +{ + struct xfs_quotainfo *qi = mp->m_quotainfo; + + return clamp_t(time64_t, timeout, qi->qi_expiry_min, + qi->qi_expiry_max); +} + +/* Set the length of the default grace period. */ +time64_t +xfs_dquot_set_grace_period( + time64_t grace) +{ + return clamp_t(time64_t, grace, XFS_DQ_GRACE_MIN, XFS_DQ_GRACE_MAX); +} + /* * Determine if this quota counter is over either limit and set the quota * timers as appropriate. 
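 *
 * Any expiration stamped here goes through xfs_dquot_set_timeout() and
 * is clamped to the mount's representable window; on a bigtime
 * filesystem that window works out to [4, 16299260424] seconds since
 * the Unix epoch, per the XFS_CHECK_VALUE assertions added to
 * xfs_ondisk.h:
 *
 *	res->timer = xfs_dquot_set_timeout(mp,
 *			ktime_get_real_seconds() + qlim->time);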
*/ static inline void xfs_qm_adjust_res_timer( + struct xfs_mount *mp, struct xfs_dquot_res *res, struct xfs_quota_limits *qlim) { @@ -112,7 +133,8 @@ xfs_qm_adjust_res_timer( if ((res->softlimit && res->count > res->softlimit) || (res->hardlimit && res->count > res->hardlimit)) { if (res->timer == 0) - res->timer = ktime_get_real_seconds() + qlim->time; + res->timer = xfs_dquot_set_timeout(mp, + ktime_get_real_seconds() + qlim->time); } else { if (res->timer == 0) res->warnings = 0; @@ -145,9 +167,9 @@ xfs_qm_adjust_dqtimers( ASSERT(dq->q_id); defq = xfs_get_defquota(qi, xfs_dquot_type(dq)); - xfs_qm_adjust_res_timer(&dq->q_blk, &defq->blk); - xfs_qm_adjust_res_timer(&dq->q_ino, &defq->ino); - xfs_qm_adjust_res_timer(&dq->q_rtb, &defq->rtb); + xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_blk, &defq->blk); + xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_ino, &defq->ino); + xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_rtb, &defq->rtb); } /* @@ -201,6 +223,8 @@ xfs_qm_init_dquot_blk( d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_type = type; + if (curid > 0 && xfs_sb_version_hasbigtime(&mp->m_sb)) + d->dd_diskdq.d_type |= XFS_DQTYPE_BIGTIME; if (xfs_sb_version_hascrc(&mp->m_sb)) { uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), @@ -514,9 +538,9 @@ xfs_dquot_from_disk( dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns); dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns); - dqp->q_blk.timer = be32_to_cpu(ddqp->d_btimer); - dqp->q_ino.timer = be32_to_cpu(ddqp->d_itimer); - dqp->q_rtb.timer = be32_to_cpu(ddqp->d_rtbtimer); + dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer); + dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer); + dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer); /* * Reservation counters are defined as reservation plus current usage @@ -559,9 +583,9 @@ xfs_dquot_to_disk( ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings); ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings); - ddqp->d_btimer = cpu_to_be32(dqp->q_blk.timer); - ddqp->d_itimer = cpu_to_be32(dqp->q_ino.timer); - ddqp->d_rtbtimer = cpu_to_be32(dqp->q_rtb.timer); + ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer); + ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer); + ddqp->d_rtbtimer = xfs_dquot_to_disk_ts(dqp, dqp->q_rtb.timer); } /* Allocate and initialize the dquot buffer for this in-core dquot. */ @@ -1107,7 +1131,7 @@ xfs_qm_dqflush_done( } void -xfs_dquot_done( +xfs_buf_dquot_iodone( struct xfs_buf *bp) { struct xfs_log_item *lip, *n; @@ -1118,6 +1142,18 @@ xfs_dquot_done( } } +void +xfs_buf_dquot_io_fail( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip; + + spin_lock(&bp->b_mount->m_ail->ail_lock); + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) + xfs_set_li_failed(lip, bp); + spin_unlock(&bp->b_mount->m_ail->ail_lock); +} + /* Check incore dquot for errors before we flush. 
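 *
 * Note that dquot zero carries the default limits, and its timer
 * fields hold the default grace periods. Those are durations rather
 * than timestamps, so the root dquot must never be flagged
 * XFS_DQTYPE_BIGTIME; the new checks below enforce that alongside the
 * superblock feature bit.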
*/ static xfs_failaddr_t xfs_qm_dqflush_check( @@ -1145,6 +1181,14 @@ xfs_qm_dqflush_check( !dqp->q_rtb.timer) return __this_address; + /* bigtime flag should never be set on root dquots */ + if (dqp->q_type & XFS_DQTYPE_BIGTIME) { + if (!xfs_sb_version_hasbigtime(&dqp->q_mount->m_sb)) + return __this_address; + if (dqp->q_id == 0) + return __this_address; + } + return NULL; } diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 282a65da93c7..f642884a6834 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -237,4 +237,7 @@ typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type, xfs_qm_dqiterate_fn iter_fn, void *priv); +time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout); +time64_t xfs_dquot_set_grace_period(time64_t grace); + #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index a29f78a663ca..3d1b95124744 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1008,6 +1008,21 @@ xfs_file_fadvise( return ret; } +/* Does this file, inode, or mount want synchronous writes? */ +static inline bool xfs_file_sync_writes(struct file *filp) +{ + struct xfs_inode *ip = XFS_I(file_inode(filp)); + + if (ip->i_mount->m_flags & XFS_MOUNT_WSYNC) + return true; + if (filp->f_flags & (__O_SYNC | O_DSYNC)) + return true; + if (IS_SYNC(file_inode(filp))) + return true; + + return false; +} + STATIC loff_t xfs_file_remap_range( struct file *file_in, @@ -1065,7 +1080,7 @@ xfs_file_remap_range( if (ret) goto out_unlock; - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out)) xfs_log_force_inode(dest); out_unlock: xfs_iunlock2_io_mmap(src, dest); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 101028ebb571..deb99300d171 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -52,7 +52,6 @@ xfs_inode_alloc( XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); /* initialise the xfs inode */ @@ -123,7 +122,7 @@ void xfs_inode_free( struct xfs_inode *ip) { - ASSERT(!xfs_isiflocked(ip)); + ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING)); /* * Because we use RCU freeing we need to ensure the inode always @@ -1035,23 +1034,21 @@ xfs_reclaim_inode( if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) goto out; - if (!xfs_iflock_nowait(ip)) + if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING)) goto out_iunlock; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); - /* xfs_iflush_abort() drops the flush lock */ xfs_iflush_abort(ip); goto reclaim; } if (xfs_ipincount(ip)) - goto out_ifunlock; + goto out_clear_flush; if (!xfs_inode_clean(ip)) - goto out_ifunlock; + goto out_clear_flush; - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); reclaim: - ASSERT(!xfs_isiflocked(ip)); /* * Because we use RCU freeing we need to ensure the inode always appears @@ -1101,8 +1098,8 @@ reclaim: __xfs_inode_free(ip); return; -out_ifunlock: - xfs_ifunlock(ip); +out_clear_flush: + xfs_iflags_clear(ip, XFS_IFLUSHING); out_iunlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); out: @@ -1211,7 +1208,7 @@ xfs_reclaim_inodes( while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { xfs_ail_push_all_sync(mp->m_ail); xfs_reclaim_inodes_ag(mp, &nr_to_scan); - }; + } } /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c06129cffba9..49624973eecc 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -598,22 +598,6 @@ xfs_lock_two_inodes( } } -void -__xfs_iflock( - 
struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); - - do { - prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - if (xfs_isiflocked(ip)) - io_schedule(); - } while (!xfs_iflock_nowait(ip)); - - finish_wait(wq, &wait.wq_entry); -} - STATIC uint _xfs_dic2xflags( uint16_t di_flags, @@ -840,7 +824,7 @@ xfs_ialloc( if (xfs_sb_version_has_v3inode(&mp->m_sb)) { inode_set_iversion(inode, 1); - ip->i_d.di_flags2 = 0; + ip->i_d.di_flags2 = mp->m_ino_geo.new_diflags2; ip->i_d.di_cowextsize = 0; ip->i_d.di_crtime = tv; } @@ -2531,11 +2515,8 @@ retry: * valid, the wrong inode or stale. */ spin_lock(&ip->i_flags_lock); - if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) { - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - return; - } + if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) + goto out_iflags_unlock; /* * Don't try to lock/unlock the current inode, but we _cannot_ skip the @@ -2552,16 +2533,14 @@ retry: } } ip->i_flags |= XFS_ISTALE; - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); /* - * If we can't get the flush lock, the inode is already attached. All + * If the inode is flushing, it is already attached to the buffer. All * we needed to do here is mark the inode stale so buffer IO completion * will remove it from the AIL. */ iip = ip->i_itemp; - if (!xfs_iflock_nowait(ip)) { + if (__xfs_iflags_test(ip, XFS_IFLUSHING)) { ASSERT(!list_empty(&iip->ili_item.li_bio_list)); ASSERT(iip->ili_last_fields); goto out_iunlock; @@ -2573,10 +2552,12 @@ retry: * commit as the flock synchronises removal of the inode from the * cluster buffer against inode reclaim. */ - if (!iip || list_empty(&iip->ili_item.li_bio_list)) { - xfs_ifunlock(ip); + if (!iip || list_empty(&iip->ili_item.li_bio_list)) goto out_iunlock; - } + + __xfs_iflags_set(ip, XFS_IFLUSHING); + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); /* we have a dirty inode in memory that has not yet been flushed. */ spin_lock(&iip->ili_lock); @@ -2586,9 +2567,16 @@ retry: spin_unlock(&iip->ili_lock); ASSERT(iip->ili_last_fields); + if (ip != free_ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return; + out_iunlock: if (ip != free_ip) xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_iflags_unlock: + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); } /* @@ -2631,8 +2619,9 @@ xfs_ifree_cluster( /* * We obtain and lock the backing buffer first in the process - * here, as we have to ensure that any dirty inode that we - * can't get the flush lock on is attached to the buffer. + * here to ensure dirty inodes attached to the buffer remain in + * the flushing state while we mark them stale. + * * If we scan the in-memory inodes first, then buffer IO can * complete before we get a lock on it, and hence we may fail * to mark all the active inodes on the buffer stale. 
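Throughout this series the sleeping inode flush lock is replaced by the XFS_IFLUSHING state bit: writers claim it with a test-and-set, drop it with a plain clear, and nothing ever sleeps on it. A minimal userspace model of that discipline (a sketch only; the names below are hypothetical, and the kernel serialises i_flags under the i_flags_lock spinlock rather than with C11 atomics):

#include <stdatomic.h>
#include <stdbool.h>

#define XFS_IFLUSHING	(1u << 7)	/* inode is being flushed */

struct inode_model {
	_Atomic unsigned int i_flags;
};

/* Claim the flushing state; returns false if it is already claimed. */
static bool iflags_try_start_flush(struct inode_model *ip)
{
	return !(atomic_fetch_or(&ip->i_flags, XFS_IFLUSHING) &
		 XFS_IFLUSHING);
}

/* Drop the flushing state; there are no waiters, so no wakeup. */
static void iflags_end_flush(struct inode_model *ip)
{
	atomic_fetch_and(&ip->i_flags, ~XFS_IFLUSHING);
}

Callers that cannot make progress while another thread holds the state, such as xfs_reclaim_inode() above, back off and retry on a later pass instead of sleeping.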
@@ -2717,7 +2706,7 @@ xfs_ifree( VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; - ip->i_d.di_flags2 = 0; + ip->i_d.di_flags2 = ip->i_mount->m_ino_geo.new_diflags2; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; @@ -3443,7 +3432,7 @@ xfs_iflush( int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); + ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING)); ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); ASSERT(iip->ili_item.li_buf == bp); @@ -3553,8 +3542,8 @@ xfs_iflush( * * What we do is move the bits to the ili_last_fields field. When * logging the inode, these bits are moved back to the ili_fields field. - * In the xfs_iflush_done() routine we clear ili_last_fields, since we - * know that the information those bits represent is permanently on + * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since + * we know that the information those bits represent is permanently on * disk. As long as the flush completes before the inode is logged * again, then both ili_fields and ili_last_fields will be cleared. */ @@ -3568,7 +3557,7 @@ flush_out: /* * Store the current LSN of the inode so that we can tell whether the - * item has moved in the AIL from xfs_iflush_done(). + * item has moved in the AIL from xfs_buf_inode_iodone(). */ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); @@ -3613,7 +3602,7 @@ xfs_iflush_cluster( /* * Quick and dirty check to avoid locks if possible. */ - if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) continue; if (xfs_ipincount(ip)) continue; @@ -3627,7 +3616,7 @@ xfs_iflush_cluster( */ spin_lock(&ip->i_flags_lock); ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); - if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) { + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) { spin_unlock(&ip->i_flags_lock); continue; } @@ -3635,24 +3624,17 @@ xfs_iflush_cluster( /* * ILOCK will pin the inode against reclaim and prevent * concurrent transactions modifying the inode while we are - * flushing the inode. + * flushing the inode. If we get the lock, set the flushing + * state before we drop the i_flags_lock. */ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { spin_unlock(&ip->i_flags_lock); continue; } + __xfs_iflags_set(ip, XFS_IFLUSHING); spin_unlock(&ip->i_flags_lock); /* - * Skip inodes that are already flush locked as they have - * already been written to the buffer. - */ - if (!xfs_iflock_nowait(ip)) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - continue; - } - - /* * Abort flushing this inode if we are shut down because the * inode may not currently be in the AIL. 
This can occur when * log I/O failure unpins the inode without inserting into the @@ -3661,7 +3643,6 @@ xfs_iflush_cluster( */ if (XFS_FORCED_SHUTDOWN(mp)) { xfs_iunpin_wait(ip); - /* xfs_iflush_abort() drops the flush lock */ xfs_iflush_abort(ip); xfs_iunlock(ip, XFS_ILOCK_SHARED); error = -EIO; @@ -3670,7 +3651,7 @@ xfs_iflush_cluster( /* don't block waiting on a log force to unpin dirty inodes */ if (xfs_ipincount(ip)) { - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); xfs_iunlock(ip, XFS_ILOCK_SHARED); continue; } @@ -3678,7 +3659,7 @@ xfs_iflush_cluster( if (!xfs_inode_clean(ip)) error = xfs_iflush(ip, bp); else - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (error) break; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e9a8bb184d1f..751a3d1d7d84 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -194,6 +194,11 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) return ip->i_cowfp && ip->i_cowfp->if_bytes; } +static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) +{ + return ip->i_d.di_flags2 & XFS_DIFLAG2_BIGTIME; +} + /* * Return the buftarg used for data allocations on a given inode. */ @@ -211,8 +216,7 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) #define XFS_INEW (1 << __XFS_INEW_BIT) #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ -#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ -#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) +#define XFS_IFLUSHING (1 << 7) /* inode is being flushed */ #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) #define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */ @@ -234,36 +238,6 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) XFS_IDIRTY_RELEASE | XFS_ITRUNCATED) /* - * Synchronize processes attempting to flush the in-core inode back to disk. - */ - -static inline int xfs_isiflocked(struct xfs_inode *ip) -{ - return xfs_iflags_test(ip, XFS_IFLOCK); -} - -extern void __xfs_iflock(struct xfs_inode *ip); - -static inline int xfs_iflock_nowait(struct xfs_inode *ip) -{ - return !xfs_iflags_test_and_set(ip, XFS_IFLOCK); -} - -static inline void xfs_iflock(struct xfs_inode *ip) -{ - if (!xfs_iflock_nowait(ip)) - __xfs_iflock(ip); -} - -static inline void xfs_ifunlock(struct xfs_inode *ip) -{ - ASSERT(xfs_isiflocked(ip)); - xfs_iflags_clear(ip, XFS_IFLOCK); - smp_mb(); - wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); -} - -/* * Flags for inode locking. * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) * 1<<16 - 1<<32-1 -- lockdep annotation (integers) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6c65938cee1c..17e20a6d8b4e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -295,6 +295,28 @@ xfs_inode_item_format_attr_fork( } } +/* + * Convert an incore timestamp to a log timestamp. Note that the log format + * specifies host endian format! 
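+ *
+ * With bigtime enabled, xfs_inode_encode_bigtime() packs the whole
+ * timestamp into an unsigned 64-bit count of nanoseconds from a
+ * shifted epoch, roughly (a sketch consistent with the range checks
+ * added to xfs_ondisk.h in this diff):
+ *
+ *	bigtime = (tv.tv_sec + XFS_BIGTIME_EPOCH_OFFSET) * NSEC_PER_SEC +
+ *		  tv.tv_nsec;
+ *
+ * where the epoch offset maps the old minimum (S32_MIN, December
+ * 1901) to zero, pushing the maximum out to the year 2486.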
+ */ +static inline xfs_ictimestamp_t +xfs_inode_to_log_dinode_ts( + struct xfs_inode *ip, + const struct timespec64 tv) +{ + struct xfs_legacy_ictimestamp *lits; + xfs_ictimestamp_t its; + + if (xfs_inode_has_bigtime(ip)) + return xfs_inode_encode_bigtime(tv); + + lits = (struct xfs_legacy_ictimestamp *)&its; + lits->t_sec = tv.tv_sec; + lits->t_nsec = tv.tv_nsec; + + return its; +} + static void xfs_inode_to_log_dinode( struct xfs_inode *ip, @@ -313,12 +335,9 @@ xfs_inode_to_log_dinode( memset(to->di_pad, 0, sizeof(to->di_pad)); memset(to->di_pad3, 0, sizeof(to->di_pad3)); - to->di_atime.t_sec = inode->i_atime.tv_sec; - to->di_atime.t_nsec = inode->i_atime.tv_nsec; - to->di_mtime.t_sec = inode->i_mtime.tv_sec; - to->di_mtime.t_nsec = inode->i_mtime.tv_nsec; - to->di_ctime.t_sec = inode->i_ctime.tv_sec; - to->di_ctime.t_nsec = inode->i_ctime.tv_nsec; + to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime); + to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime); + to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime); to->di_nlink = inode->i_nlink; to->di_gen = inode->i_generation; to->di_mode = inode->i_mode; @@ -340,8 +359,7 @@ xfs_inode_to_log_dinode( if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { to->di_version = 3; to->di_changecount = inode_peek_iversion(inode); - to->di_crtime.t_sec = from->di_crtime.tv_sec; - to->di_crtime.t_nsec = from->di_crtime.tv_nsec; + to->di_crtime = xfs_inode_to_log_dinode_ts(ip, from->di_crtime); to->di_flags2 = from->di_flags2; to->di_cowextsize = from->di_cowextsize; to->di_ino = ip->i_ino; @@ -491,8 +509,7 @@ xfs_inode_item_push( (ip->i_flags & XFS_ISTALE)) return XFS_ITEM_PINNED; - /* If the inode is already flush locked, we're already flushing. */ - if (xfs_isiflocked(ip)) + if (xfs_iflags_test(ip, XFS_IFLUSHING)) return XFS_ITEM_FLUSHING; if (!xfs_buf_trylock(bp)) @@ -703,7 +720,7 @@ xfs_iflush_finish( iip->ili_last_fields = 0; iip->ili_flush_lsn = 0; spin_unlock(&iip->ili_lock); - xfs_ifunlock(iip->ili_inode); + xfs_iflags_clear(iip->ili_inode, XFS_IFLUSHING); if (drop_buffer) xfs_buf_rele(bp); } @@ -711,11 +728,11 @@ xfs_iflush_finish( /* * Inode buffer IO completion routine. It is responsible for removing inodes - * attached to the buffer from the AIL if they have not been re-logged, as well - * as completing the flush and unlocking the inode. + * attached to the buffer from the AIL if they have not been re-logged and + * completing the inode flush. */ void -xfs_iflush_done( +xfs_buf_inode_iodone( struct xfs_buf *bp) { struct xfs_log_item *lip, *n; @@ -754,11 +771,21 @@ xfs_iflush_done( list_splice_tail(&flushed_inodes, &bp->b_li_list); } +void +xfs_buf_inode_io_fail( + struct xfs_buf *bp) +{ + struct xfs_log_item *lip; + + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) + set_bit(XFS_LI_FAILED, &lip->li_flags); +} + /* - * This is the inode flushing abort routine. It is called from xfs_iflush when + * This is the inode flushing abort routine. It is called when * the filesystem is shutting down to clean up the inode state. It is * responsible for removing the inode item from the AIL if it has not been - * re-logged, and unlocking the inode's flush lock. + * re-logged and clearing the inode's flush state. 
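+ *
+ * Unlike the old flush lock, XFS_IFLUSHING has no wait queue, so
+ * clearing the flag here does not require a wakeup.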
*/ void xfs_iflush_abort( @@ -790,7 +817,7 @@ xfs_iflush_abort( list_del_init(&iip->ili_item.li_bio_list); spin_unlock(&iip->ili_lock); } - xfs_ifunlock(ip); + xfs_iflags_clear(ip, XFS_IFLUSHING); if (bp) xfs_buf_rele(bp); } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 048b5e7dee90..4b926e32831c 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -25,8 +25,8 @@ struct xfs_inode_log_item { * * We need atomic changes between inode dirtying, inode flushing and * inode completion, but these all hold different combinations of - * ILOCK and iflock and hence we need some other method of serialising - * updates to the flush state. + * ILOCK and IFLUSHING and hence we need some other method of + * serialising updates to the flush state. */ spinlock_t ili_lock; /* flush state lock */ unsigned int ili_last_fields; /* fields when flushed */ @@ -43,7 +43,6 @@ static inline int xfs_inode_clean(struct xfs_inode *ip) extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); -extern void xfs_iflush_done(struct xfs_buf *); extern void xfs_iflush_abort(struct xfs_inode *); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, struct xfs_inode_log_format *); diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 5e0d291835b3..cb44f7653f03 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -115,6 +115,82 @@ out_free_ip: return error; } +static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld) +{ + return ld->di_version >= 3 && + (ld->di_flags2 & XFS_DIFLAG2_BIGTIME); +} + +/* Convert a log timestamp to an ondisk timestamp. */ +static inline xfs_timestamp_t +xfs_log_dinode_to_disk_ts( + struct xfs_log_dinode *from, + const xfs_ictimestamp_t its) +{ + struct xfs_legacy_timestamp *lts; + struct xfs_legacy_ictimestamp *lits; + xfs_timestamp_t ts; + + if (xfs_log_dinode_has_bigtime(from)) + return cpu_to_be64(its); + + lts = (struct xfs_legacy_timestamp *)&ts; + lits = (struct xfs_legacy_ictimestamp *)&its; + lts->t_sec = cpu_to_be32(lits->t_sec); + lts->t_nsec = cpu_to_be32(lits->t_nsec); + + return ts; +} + +STATIC void +xfs_log_dinode_to_disk( + struct xfs_log_dinode *from, + struct xfs_dinode *to) +{ + to->di_magic = cpu_to_be16(from->di_magic); + to->di_mode = cpu_to_be16(from->di_mode); + to->di_version = from->di_version; + to->di_format = from->di_format; + to->di_onlink = 0; + to->di_uid = cpu_to_be32(from->di_uid); + to->di_gid = cpu_to_be32(from->di_gid); + to->di_nlink = cpu_to_be32(from->di_nlink); + to->di_projid_lo = cpu_to_be16(from->di_projid_lo); + to->di_projid_hi = cpu_to_be16(from->di_projid_hi); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + + to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime); + to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime); + to->di_ctime = xfs_log_dinode_to_disk_ts(from, from->di_ctime); + + to->di_size = cpu_to_be64(from->di_size); + to->di_nblocks = cpu_to_be64(from->di_nblocks); + to->di_extsize = cpu_to_be32(from->di_extsize); + to->di_nextents = cpu_to_be32(from->di_nextents); + to->di_anextents = cpu_to_be16(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = cpu_to_be32(from->di_dmevmask); + to->di_dmstate = cpu_to_be16(from->di_dmstate); + to->di_flags = cpu_to_be16(from->di_flags); + to->di_gen = cpu_to_be32(from->di_gen); + + if (from->di_version == 3) { + 
to->di_changecount = cpu_to_be64(from->di_changecount); + to->di_crtime = xfs_log_dinode_to_disk_ts(from, + from->di_crtime); + to->di_flags2 = cpu_to_be64(from->di_flags2); + to->di_cowextsize = cpu_to_be32(from->di_cowextsize); + to->di_ino = cpu_to_be64(from->di_ino); + to->di_lsn = cpu_to_be64(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + to->di_flushiter = 0; + } else { + to->di_flushiter = cpu_to_be16(from->di_flushiter); + } +} + STATIC int xlog_recover_inode_commit_pass2( struct xlog *log, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6f22a66777cd..bca7659fb5c6 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -404,7 +404,7 @@ xfs_ioc_attr_list( context.cursor.offset)) return -EINVAL; - buffer = kmem_zalloc_large(bufsize, 0); + buffer = kvzalloc(bufsize, GFP_KERNEL); if (!buffer) return -ENOMEM; @@ -1190,7 +1190,8 @@ xfs_flags2diflags2( unsigned int xflags) { uint64_t di_flags2 = - (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); + (ip->i_d.di_flags2 & (XFS_DIFLAG2_REFLINK | + XFS_DIFLAG2_BIGTIME)); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; @@ -1690,7 +1691,7 @@ xfs_ioc_getbmap( if (bmx.bmv_count > ULONG_MAX / recsize) return -ENOMEM; - buf = kmem_zalloc_large(bmx.bmv_count * sizeof(*buf), 0); + buf = kvzalloc(bmx.bmv_count * sizeof(*buf), GFP_KERNEL); if (!buf) return -ENOMEM; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e2ec91b2d0f4..a17d788921d6 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -265,32 +265,6 @@ xlog_header_check_mount( return 0; } -void -xlog_recover_iodone( - struct xfs_buf *bp) -{ - if (bp->b_error) { - /* - * We're not going to bother about retrying - * this during recovery. One strike! - */ - if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) { - xfs_buf_ioerror_alert(bp, __this_address); - xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); - } - } - - /* - * On v5 supers, a bli could be attached to update the metadata LSN. - * Clean it up. - */ - if (bp->b_log_item) - xfs_buf_item_relse(bp); - ASSERT(bp->b_log_item == NULL); - bp->b_flags &= ~_XBF_LOGRECOVERY; - xfs_buf_ioend_finish(bp); -} - /* * This routine finds (to an approximation) the first block in the physical * log which contains the given cycle. It uses a binary search algorithm. @@ -2097,7 +2071,7 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kmem_realloc(old_ptr, len + old_len, 0); + ptr = krealloc(old_ptr, len + old_len, GFP_KERNEL | __GFP_NOFAIL); memcpy(&ptr[old_len], dp, len); item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; @@ -3294,14 +3268,14 @@ xlog_do_log_recovery( */ STATIC int xlog_do_recover( - struct xlog *log, - xfs_daddr_t head_blk, - xfs_daddr_t tail_blk) + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) { - struct xfs_mount *mp = log->l_mp; - int error; - xfs_buf_t *bp; - xfs_sb_t *sbp; + struct xfs_mount *mp = log->l_mp; + struct xfs_buf *bp = mp->m_sb_bp; + struct xfs_sb *sbp = &mp->m_sb; + int error; trace_xfs_log_recover(log, head_blk, tail_blk); @@ -3315,9 +3289,8 @@ xlog_do_recover( /* * If IO errors happened during recovery, bail out. 
*/ - if (XFS_FORCED_SHUTDOWN(mp)) { + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - } /* * We now update the tail_lsn since much of the recovery has completed @@ -3331,16 +3304,12 @@ xlog_do_recover( xlog_assign_tail_lsn(mp); /* - * Now that we've finished replaying all buffer and inode - * updates, re-read in the superblock and reverify it. + * Now that we've finished replaying all buffer and inode updates, + * re-read the superblock and reverify it. */ - bp = xfs_getsb(mp); - bp->b_flags &= ~(XBF_DONE | XBF_ASYNC); - ASSERT(!(bp->b_flags & XBF_WRITE)); - bp->b_flags |= XBF_READ; - bp->b_ops = &xfs_sb_buf_ops; - - error = xfs_buf_submit(bp); + xfs_buf_lock(bp); + xfs_buf_hold(bp); + error = _xfs_buf_read(bp, XBF_READ); if (error) { if (!XFS_FORCED_SHUTDOWN(mp)) { xfs_buf_ioerror_alert(bp, __this_address); @@ -3351,7 +3320,6 @@ xlog_do_recover( } /* Convert superblock from on-disk format */ - sbp = &mp->m_sb; xfs_sb_from_disk(sbp, bp->b_addr); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c8ae49a1e99c..150ee5cb8645 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -80,9 +80,9 @@ xfs_uuid_mount( } if (hole < 0) { - xfs_uuid_table = kmem_realloc(xfs_uuid_table, + xfs_uuid_table = krealloc(xfs_uuid_table, (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), - 0); + GFP_KERNEL | __GFP_NOFAIL); hole = xfs_uuid_table_size++; } xfs_uuid_table[hole] = *uuid; @@ -1059,11 +1059,12 @@ xfs_unmountfs( * We can potentially deadlock here if we have an inode cluster * that has been freed has its buffer still pinned in memory because * the transaction is still sitting in a iclog. The stale inodes - * on that buffer will have their flush locks held until the - * transaction hits the disk and the callbacks run. the inode - * flush takes the flush lock unconditionally and with nothing to - * push out the iclog we will never get that unlocked. hence we - * need to force the log first. + * on that buffer will be pinned to the buffer until the + * transaction hits the disk and the callbacks run. Pushing the AIL will + * skip the stale inodes and may never see the pinned buffer, so + * nothing will push out the iclog and unpin the buffer. Hence we + * need to force the log here to ensure all items are flushed into the + * AIL before we go any further. */ xfs_log_force(mp, XFS_LOG_SYNC); @@ -1289,23 +1290,6 @@ xfs_mod_frextents( } /* - * xfs_getsb() is called to obtain the buffer for the superblock. - * The buffer is returned locked and read in from disk. - * The buffer should be released with a call to xfs_brelse(). - */ -struct xfs_buf * -xfs_getsb( - struct xfs_mount *mp) -{ - struct xfs_buf *bp = mp->m_sb_bp; - - xfs_buf_lock(bp); - xfs_buf_hold(bp); - ASSERT(bp->b_flags & XBF_DONE); - return bp; -} - -/* * Used to free the superblock along various error paths. 
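 *
 * Note that with xfs_getsb() removed, the remaining callers lock and
 * hold the cached superblock buffer directly, as xlog_do_recover()
 * above and xfs_trans_getsb() below now do:
 *
 *	struct xfs_buf	*bp = mp->m_sb_bp;
 *
 *	xfs_buf_lock(bp);
 *	xfs_buf_hold(bp);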
*/ void diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a72cfcaa4ad1..dfa429b77ee2 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -410,7 +410,6 @@ extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved); extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); -extern struct xfs_buf *xfs_getsb(xfs_mount_t *); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); extern bool xfs_fs_writable(struct xfs_mount *mp, int level); diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 5f04d8a5ab2a..0aa87c210104 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -15,6 +15,10 @@ "XFS: offsetof(" #structname ", " #member ") is wrong, " \ "expected " #off) +#define XFS_CHECK_VALUE(value, expected) \ + BUILD_BUG_ON_MSG((value) != (expected), \ + "XFS: value of " #value " is wrong, expected " #expected) + static inline void __init xfs_check_ondisk_structs(void) { @@ -23,7 +27,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224); XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36); - XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 336); + XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4); @@ -41,7 +45,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24); - XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8); + XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8); @@ -84,12 +89,12 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize, 0); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count, 2); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags, 6); - XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].valuelen, 5); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].flags, 6); + XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].nameval, 7); XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); @@ -121,7 +126,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176); XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28); - XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8); + XFS_CHECK_STRUCT_SIZE(xfs_ictimestamp_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_ictimestamp, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); @@ -152,6 +158,20 @@ xfs_check_ondisk_structs(void) 
XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers, 24); XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat_req, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers_req, 64); + + /* + * Make sure the incore inode timestamp range corresponds to hand + * converted values based on the ondisk format specification. + */ + XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MIN - XFS_BIGTIME_EPOCH_OFFSET, + XFS_LEGACY_TIME_MIN); + XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MAX - XFS_BIGTIME_EPOCH_OFFSET, + 16299260424LL); + + /* Do the same with the incore quota expiration range. */ + XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MIN << XFS_DQ_BIGTIME_SHIFT, 4); + XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MAX << XFS_DQ_BIGTIME_SHIFT, + 16299260424LL); } #endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index be67570badf8..3f82e0c92c2d 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -661,6 +661,17 @@ xfs_qm_init_quotainfo( /* Precalc some constants */ qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen); + if (xfs_sb_version_hasbigtime(&mp->m_sb)) { + qinf->qi_expiry_min = + xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MIN); + qinf->qi_expiry_max = + xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MAX); + } else { + qinf->qi_expiry_min = XFS_DQ_LEGACY_EXPIRY_MIN; + qinf->qi_expiry_max = XFS_DQ_LEGACY_EXPIRY_MAX; + } + trace_xfs_quota_expiry_range(mp, qinf->qi_expiry_min, + qinf->qi_expiry_max); mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); @@ -879,6 +890,8 @@ xfs_qm_reset_dqcounts( ddq->d_bwarns = 0; ddq->d_iwarns = 0; ddq->d_rtbwarns = 0; + if (xfs_sb_version_hasbigtime(&mp->m_sb)) + ddq->d_type |= XFS_DQTYPE_BIGTIME; } if (xfs_sb_version_hascrc(&mp->m_sb)) { diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 9c078c35d924..e3dabab44097 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -65,6 +65,10 @@ struct xfs_quotainfo { struct xfs_def_quota qi_grp_default; struct xfs_def_quota qi_prj_default; struct shrinker qi_shrinker; + + /* Minimum and maximum quota expiration timestamp values. */ + time64_t qi_expiry_min; + time64_t qi_expiry_max; }; static inline struct radix_tree_root * diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 1c542b4a5220..ca1b57d291dc 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -479,13 +479,19 @@ xfs_setqlim_warns( static inline void xfs_setqlim_timer( + struct xfs_mount *mp, struct xfs_dquot_res *res, struct xfs_quota_limits *qlim, s64 timer) { - res->timer = timer; - if (qlim) - qlim->time = timer; + if (qlim) { + /* Set the length of the default grace period. */ + res->timer = xfs_dquot_set_grace_period(timer); + qlim->time = res->timer; + } else { + /* Set the grace period expiration on a quota. */ + res->timer = xfs_dquot_set_timeout(mp, timer); + } } /* @@ -574,7 +580,7 @@ xfs_qm_scall_setqlim( if (newlim->d_fieldmask & QC_SPC_WARNS) xfs_setqlim_warns(res, qlim, newlim->d_spc_warns); if (newlim->d_fieldmask & QC_SPC_TIMER) - xfs_setqlim_timer(res, qlim, newlim->d_spc_timer); + xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer); /* Blocks on the realtime device. */ hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ? 
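The xfs_setqlim_timer() split above distinguishes two cases: with a default-limits structure present, the value is a grace period (a duration, clamped against the compile-time grace limits); otherwise it is an expiration stamped on one dquot (a timestamp, clamped against the mount's ondisk-representable window). A standalone sketch of the clamp behaviour follows; the window values are assumptions read off the XFS_CHECK_VALUE assertions added to xfs_ondisk.h earlier in this diff:

#include <stdint.h>
#include <stdio.h>

typedef int64_t time64_t;

/* Simplified stand-in for the kernel's clamp_t() macro. */
#define clamp_t(type, val, lo, hi) \
	((type)(val) < (type)(lo) ? (type)(lo) : \
	 (type)(val) > (type)(hi) ? (type)(hi) : (type)(val))

int main(void)
{
	/* Bigtime quota expiry window in Unix seconds. */
	time64_t expiry_min = 4, expiry_max = 16299260424LL;

	/* A requested expiry beyond the representable maximum is
	 * pinned to the maximum rather than wrapping on disk. */
	printf("%lld\n", (long long)clamp_t(time64_t, 99999999999LL,
					    expiry_min, expiry_max));
	return 0;
}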
@@ -590,7 +596,7 @@ xfs_qm_scall_setqlim( if (newlim->d_fieldmask & QC_RT_SPC_WARNS) xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns); if (newlim->d_fieldmask & QC_RT_SPC_TIMER) - xfs_setqlim_timer(res, qlim, newlim->d_rt_spc_timer); + xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer); /* Inodes */ hard = (newlim->d_fieldmask & QC_INO_HARD) ? @@ -606,7 +612,7 @@ xfs_qm_scall_setqlim( if (newlim->d_fieldmask & QC_INO_WARNS) xfs_setqlim_warns(res, qlim, newlim->d_ino_warns); if (newlim->d_fieldmask & QC_INO_TIMER) - xfs_setqlim_timer(res, qlim, newlim->d_ino_timer); + xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer); if (id != 0) { /* diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 06b22e35fc90..5a62398940d0 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -108,8 +108,6 @@ extern void xfs_qm_mount_quotas(struct xfs_mount *); extern void xfs_qm_unmount(struct xfs_mount *); extern void xfs_qm_unmount_quotas(struct xfs_mount *); -void xfs_dquot_done(struct xfs_buf *); - #else static inline int xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, @@ -151,12 +149,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) #define xfs_qm_unmount_quotas(mp) - -static inline void xfs_dquot_done(struct xfs_buf *bp) -{ - return; -} - #endif /* CONFIG_XFS_QUOTA */ #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 6209e7b6b895..5b89c12f1566 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -247,6 +247,9 @@ xfs_rtallocate_extent_block( end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1; i <= end; i++) { + /* Make sure we don't scan off the end of the rt volume. */ + maxlen = min(mp->m_sb.sb_rextents, i + maxlen) - i; + /* * See if there's a free extent of maxlen starting at i. * If it's not so then next will contain the first non-free. @@ -442,6 +445,14 @@ xfs_rtallocate_extent_near( */ if (bno >= mp->m_sb.sb_rextents) bno = mp->m_sb.sb_rextents - 1; + + /* Make sure we don't run off the end of the rt volume. */ + maxlen = min(mp->m_sb.sb_rextents, bno + maxlen) - bno; + if (maxlen < minlen) { + *rtblock = NULLRTBLOCK; + return 0; + } + /* * Try the exact allocation first. */ @@ -862,7 +873,7 @@ xfs_alloc_rsum_cache( * lower bound on the minimum level with any free extents. We can * continue without the cache if it couldn't be allocated. */ - mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0); + mp->m_rsum_cache = kvzalloc(rbmblocks, GFP_KERNEL); if (!mp->m_rsum_cache) xfs_warn(mp, "could not allocate realtime summary cache"); } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 71ac6c1cdc36..baf5de30eebb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -654,11 +654,11 @@ xfs_fs_destroy_inode( ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); /* - * We always use background reclaim here because even if the - * inode is clean, it still may be under IO and hence we have - * to take the flush lock. The background reclaim path handles - * this more efficiently than we can here, so simply let background - * reclaim tear down all inodes. + * We always use background reclaim here because even if the inode is + * clean, it still may be under IO and hence we have wait for IO + * completion to occur before we can reclaim the inode. The background + * reclaim path handles this more efficiently than we can here, so + * simply let background reclaim tear down all inodes. 
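+ *
+ * (With the flush lock gone, "under IO" means XFS_IFLUSHING is set;
+ * background reclaim skips such inodes and revisits them on a later
+ * pass rather than sleeping.)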
*/ xfs_inode_set_reclaim_tag(ip); } @@ -1484,8 +1484,14 @@ xfs_fc_fill_super( sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; - sb->s_time_min = S32_MIN; - sb->s_time_max = S32_MAX; + if (xfs_sb_version_hasbigtime(&mp->m_sb)) { + sb->s_time_min = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MIN); + sb->s_time_max = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MAX); + } else { + sb->s_time_min = XFS_LEGACY_TIME_MIN; + sb->s_time_max = XFS_LEGACY_TIME_MAX; + } + trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max); sb->s_iflags |= SB_I_CGROUPWB; set_posix_acl_flag(sb); @@ -1494,6 +1500,10 @@ xfs_fc_fill_super( if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) sb->s_flags |= SB_I_VERSION; + if (xfs_sb_version_hasbigtime(&mp->m_sb)) + xfs_warn(mp, + "EXPERIMENTAL big timestamp feature in use. Use at your own risk!"); + if (mp->m_flags & XFS_MOUNT_DAX_ALWAYS) { bool rtdev_is_dax = false, datadev_is_dax; @@ -1549,6 +1559,10 @@ xfs_fc_fill_super( goto out_filestream_unmount; } + if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) + xfs_warn(mp, + "EXPERIMENTAL inode btree counters feature in use. Use at your own risk!"); + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index abb1d859f226..dcdcf99cfa5d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -338,7 +338,7 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_split); DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_buf_item_relse); -DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); +DEFINE_BUF_EVENT(xfs_buf_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); @@ -3676,7 +3676,6 @@ DEFINE_EVENT(xfs_kmem_class, name, \ DEFINE_KMEM_EVENT(kmem_alloc); DEFINE_KMEM_EVENT(kmem_alloc_io); DEFINE_KMEM_EVENT(kmem_alloc_large); -DEFINE_KMEM_EVENT(kmem_realloc); TRACE_EVENT(xfs_check_new_dalign, TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino), @@ -3844,6 +3843,32 @@ TRACE_EVENT(xfs_btree_bload_block, __entry->nr_records) ) +DECLARE_EVENT_CLASS(xfs_timestamp_range_class, + TP_PROTO(struct xfs_mount *mp, time64_t min, time64_t max), + TP_ARGS(mp, min, max), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(long long, min) + __field(long long, max) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->min = min; + __entry->max = max; + ), + TP_printk("dev %d:%d min %lld max %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->min, + __entry->max) +) + +#define DEFINE_TIMESTAMP_RANGE_EVENT(name) \ +DEFINE_EVENT(xfs_timestamp_range_class, name, \ + TP_PROTO(struct xfs_mount *mp, long long min, long long max), \ + TP_ARGS(mp, min, max)) +DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range); +DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index ed72867b1a19..ca18a040336a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -468,7 +468,7 @@ xfs_trans_apply_sb_deltas( xfs_buf_t *bp; int whole = 0; - bp = xfs_trans_getsb(tp, tp->t_mountp); + bp = xfs_trans_getsb(tp); sbp = bp->b_addr; /* diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index b752501818d2..f46534b75236 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -209,7 +209,7 @@ xfs_trans_read_buf( flags, bpp, ops); } -struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *); 
+struct xfs_buf *xfs_trans_getsb(struct xfs_trans *); void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *); void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 11cd666cd99a..42d63b830cb9 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -166,50 +166,34 @@ xfs_trans_get_buf_map( } /* - * Get and lock the superblock buffer of this file system for the - * given transaction. - * - * We don't need to use incore_match() here, because the superblock - * buffer is a private buffer which we keep a pointer to in the - * mount structure. + * Get and lock the superblock buffer for the given transaction. */ -xfs_buf_t * +struct xfs_buf * xfs_trans_getsb( - xfs_trans_t *tp, - struct xfs_mount *mp) + struct xfs_trans *tp) { - xfs_buf_t *bp; - struct xfs_buf_log_item *bip; + struct xfs_buf *bp = tp->t_mountp->m_sb_bp; /* - * Default to just trying to lock the superblock buffer - * if tp is NULL. + * Just increment the lock recursion count if the buffer is already + * attached to this transaction. */ - if (tp == NULL) - return xfs_getsb(mp); - - /* - * If the superblock buffer already has this transaction - * pointer in its b_fsprivate2 field, then we know we already - * have it locked. In this case we just increment the lock - * recursion count and return the buffer to the caller. - */ - bp = mp->m_sb_bp; if (bp->b_transp == tp) { - bip = bp->b_log_item; + struct xfs_buf_log_item *bip = bp->b_log_item; + ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; + trace_xfs_trans_getsb_recur(bip); - return bp; - } + } else { + xfs_buf_lock(bp); + xfs_buf_hold(bp); + _xfs_trans_bjoin(tp, bp, 1); - bp = xfs_getsb(mp); - if (bp == NULL) - return NULL; + trace_xfs_trans_getsb(bp->b_log_item); + } - _xfs_trans_bjoin(tp, bp, 1); - trace_xfs_trans_getsb(bp->b_log_item); return bp; } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index c6ba7ef18e06..133fc6fc3edd 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -55,6 +55,12 @@ xfs_trans_log_dquot( { ASSERT(XFS_DQ_IS_LOCKED(dqp)); + /* Upgrade the dquot to bigtime format if possible. */ + if (dqp->q_id != 0 && + xfs_sb_version_hasbigtime(&tp->t_mountp->m_sb) && + !(dqp->q_type & XFS_DQTYPE_BIGTIME)) + dqp->q_type |= XFS_DQTYPE_BIGTIME; + tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &dqp->q_logitem.qli_item.li_flags); } |
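The final hunk above upgrades a dquot to the bigtime expiry format the next time it is logged, so existing filesystems convert lazily. Below is a sketch of the ondisk encoding that the upgrade buys; the shift and limits are inferred from the XFS_CHECK_VALUE assertions in xfs_ondisk.h (two bits of granularity traded for a much larger range), and the helper names are illustrative:

#include <stdint.h>
#include <stdio.h>

typedef int64_t time64_t;

/* Implied by "EXPIRY_MIN << SHIFT == 4" in the ondisk checks. */
#define DQ_BIGTIME_SHIFT	2

/* Ondisk bigtime expiry -> Unix seconds. */
static time64_t dq_bigtime_to_unix(uint32_t ondisk_seconds)
{
	return (time64_t)ondisk_seconds << DQ_BIGTIME_SHIFT;
}

/* Unix seconds -> ondisk bigtime expiry, rounded up so that a grace
 * period never expires earlier than the administrator asked for. */
static uint32_t dq_unix_to_bigtime(time64_t unix_seconds)
{
	return (unix_seconds + (1U << DQ_BIGTIME_SHIFT) - 1) >>
			DQ_BIGTIME_SHIFT;
}

int main(void)
{
	/* 4074815106 << 2 == 16299260424, the asserted maximum. */
	printf("max %lld\n", (long long)dq_bigtime_to_unix(4074815106u));

	/* Round-trips land on the next 4-second boundary: 7 -> 8. */
	printf("%lld\n",
	       (long long)dq_bigtime_to_unix(dq_unix_to_bigtime(7)));
	return 0;
}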