diff options
Diffstat (limited to 'fs')
151 files changed, 5368 insertions, 3860 deletions
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 0e7a6f81ae36..6043567b95c2 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -30,7 +30,7 @@ #include <asm/cacheflush.h> #include <asm/a.out-core.h> -static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); +static int load_aout_binary(struct linux_binprm *); static int load_aout_library(struct file*); #ifdef CONFIG_COREDUMP @@ -201,8 +201,9 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin * libraries. There is no binary dependent code anywhere else. */ -static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) +static int load_aout_binary(struct linux_binprm * bprm) { + struct pt_regs *regs = current_pt_regs(); struct exec ex; unsigned long error; unsigned long fd_offset; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index fbd9f60bd763..6d7d1647a68c 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -44,7 +44,7 @@ #define user_siginfo_t siginfo_t #endif -static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); +static int load_elf_binary(struct linux_binprm *bprm); static int load_elf_library(struct file *); static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, int, int, unsigned long); @@ -558,7 +558,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top) #endif } -static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) +static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ unsigned long load_addr = 0, load_bias = 0; @@ -575,6 +575,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) unsigned long reloc_func_desc __maybe_unused = 0; int executable_stack = EXSTACK_DEFAULT; unsigned long def_flags = 0; + struct pt_regs *regs = current_pt_regs(); struct { struct elfhdr elf_ex; struct elfhdr interp_elf_ex; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index a46049154107..dc84732e554f 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -56,7 +56,7 @@ typedef char *elf_caddr_t; MODULE_LICENSE("GPL"); -static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *); +static int load_elf_fdpic_binary(struct linux_binprm *); static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *); static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *, struct mm_struct *, const char *); @@ -164,10 +164,10 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, /* * load an fdpic binary into various bits of memory */ -static int load_elf_fdpic_binary(struct linux_binprm *bprm, - struct pt_regs *regs) +static int load_elf_fdpic_binary(struct linux_binprm *bprm) { struct elf_fdpic_params exec_params, interp_params; + struct pt_regs *regs = current_pt_regs(); struct elf_phdr *phdr; unsigned long stack_size, entryaddr; #ifdef ELF_FDPIC_PLAT_INIT diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index 2790c7e1912e..4e6cce57d113 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -22,7 +22,7 @@ #define EM86_INTERP "/usr/bin/em86" #define EM86_I_NAME "em86" -static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs) +static int load_em86(struct linux_binprm *bprm) { char *interp, *i_name, *i_arg; struct file * file; @@ -90,7 +90,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs) if (retval < 0) return retval; - return search_binary_handler(bprm, regs); + return search_binary_handler(bprm); } static struct linux_binfmt em86_format = { diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index e280352b28f9..b56371981d16 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -88,7 +88,7 @@ struct lib_info { static int load_flat_shared_library(int id, struct lib_info *p); #endif -static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs); +static int load_flat_binary(struct linux_binprm *); static int flat_core_dump(struct coredump_params *cprm); static struct linux_binfmt flat_format = { @@ -858,9 +858,10 @@ out: * libraries. There is no binary dependent code anywhere else. */ -static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) +static int load_flat_binary(struct linux_binprm * bprm) { struct lib_info libinfo; + struct pt_regs *regs = current_pt_regs(); unsigned long p = bprm->p; unsigned long stack_len; unsigned long start_addr; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 790b3cddca67..b0b70fbea06c 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -104,7 +104,7 @@ static Node *check_file(struct linux_binprm *bprm) /* * the loader itself */ -static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) +static int load_misc_binary(struct linux_binprm *bprm) { Node *fmt; struct file * interp_file = NULL; @@ -199,7 +199,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) bprm->recursion_depth++; - retval = search_binary_handler (bprm, regs); + retval = search_binary_handler(bprm); if (retval < 0) goto _error; diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index d3b8c1f63155..8c954997e7f7 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -14,7 +14,7 @@ #include <linux/err.h> #include <linux/fs.h> -static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) +static int load_script(struct linux_binprm *bprm) { const char *i_arg, *i_name; char *cp; @@ -95,7 +95,7 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) retval = prepare_binprm(bprm); if (retval < 0) return retval; - return search_binary_handler(bprm,regs); + return search_binary_handler(bprm); } static struct linux_binfmt script_format = { diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index 4517aaff61b4..4e00ed68d4a6 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -35,7 +35,7 @@ #include <linux/elf.h> -static int load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs); +static int load_som_binary(struct linux_binprm * bprm); static int load_som_library(struct file *); /* @@ -180,13 +180,14 @@ out: */ static int -load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) +load_som_binary(struct linux_binprm * bprm) { int retval; unsigned int size; unsigned long som_entry; struct som_hdr *som_ex; struct som_exec_auxhdr *hpuxhdr; + struct pt_regs *regs = current_pt_regs(); /* Get the exec-header */ som_ex = (struct som_hdr *) bprm->buf; diff --git a/fs/block_dev.c b/fs/block_dev.c index 1a1e5e3b1eaf..ab3a456f6650 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -70,19 +70,6 @@ static void bdev_inode_switch_bdi(struct inode *inode, spin_unlock(&dst->wb.list_lock); } -sector_t blkdev_max_block(struct block_device *bdev) -{ - sector_t retval = ~((sector_t)0); - loff_t sz = i_size_read(bdev->bd_inode); - - if (sz) { - unsigned int size = block_size(bdev); - unsigned int sizebits = blksize_bits(size); - retval = (sz >> sizebits); - } - return retval; -} - /* Kill _all_ buffers and pagecache , dirty or not.. */ void kill_bdev(struct block_device *bdev) { @@ -116,8 +103,6 @@ EXPORT_SYMBOL(invalidate_bdev); int set_blocksize(struct block_device *bdev, int size) { - struct address_space *mapping; - /* Size must be a power of two, and between 512 and PAGE_SIZE */ if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) return -EINVAL; @@ -126,19 +111,6 @@ int set_blocksize(struct block_device *bdev, int size) if (size < bdev_logical_block_size(bdev)) return -EINVAL; - /* Prevent starting I/O or mapping the device */ - percpu_down_write(&bdev->bd_block_size_semaphore); - - /* Check that the block device is not memory mapped */ - mapping = bdev->bd_inode->i_mapping; - mutex_lock(&mapping->i_mmap_mutex); - if (mapping_mapped(mapping)) { - mutex_unlock(&mapping->i_mmap_mutex); - percpu_up_write(&bdev->bd_block_size_semaphore); - return -EBUSY; - } - mutex_unlock(&mapping->i_mmap_mutex); - /* Don't change the size if it is same as current */ if (bdev->bd_block_size != size) { sync_blockdev(bdev); @@ -146,9 +118,6 @@ int set_blocksize(struct block_device *bdev, int size) bdev->bd_inode->i_blkbits = blksize_bits(size); kill_bdev(bdev); } - - percpu_up_write(&bdev->bd_block_size_semaphore); - return 0; } @@ -181,52 +150,12 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { - if (iblock >= blkdev_max_block(I_BDEV(inode))) { - if (create) - return -EIO; - - /* - * for reads, we're just trying to fill a partial page. - * return a hole, they will have to call get_block again - * before they can fill it, and they will get -EIO at that - * time - */ - return 0; - } bh->b_bdev = I_BDEV(inode); bh->b_blocknr = iblock; set_buffer_mapped(bh); return 0; } -static int -blkdev_get_blocks(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - sector_t end_block = blkdev_max_block(I_BDEV(inode)); - unsigned long max_blocks = bh->b_size >> inode->i_blkbits; - - if ((iblock + max_blocks) > end_block) { - max_blocks = end_block - iblock; - if ((long)max_blocks <= 0) { - if (create) - return -EIO; /* write fully beyond EOF */ - /* - * It is a read which is fully beyond EOF. We return - * a !buffer_mapped buffer - */ - max_blocks = 0; - } - } - - bh->b_bdev = I_BDEV(inode); - bh->b_blocknr = iblock; - bh->b_size = max_blocks << inode->i_blkbits; - if (max_blocks) - set_buffer_mapped(bh); - return 0; -} - static ssize_t blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -235,7 +164,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, struct inode *inode = file->f_mapping->host; return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, - nr_segs, blkdev_get_blocks, NULL, NULL, 0); + nr_segs, blkdev_get_block, NULL, NULL, 0); } int __sync_blockdev(struct block_device *bdev, int wait) @@ -459,12 +388,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); if (!ei) return NULL; - - if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) { - kmem_cache_free(bdev_cachep, ei); - return NULL; - } - return &ei->vfs_inode; } @@ -473,8 +396,6 @@ static void bdev_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct bdev_inode *bdi = BDEV_I(inode); - percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore); - kmem_cache_free(bdev_cachep, bdi); } @@ -1593,22 +1514,6 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) return blkdev_ioctl(bdev, mode, cmd, arg); } -ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - ssize_t ret; - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); - - percpu_down_read(&bdev->bd_block_size_semaphore); - - ret = generic_file_aio_read(iocb, iov, nr_segs, pos); - - percpu_up_read(&bdev->bd_block_size_semaphore); - - return ret; -} -EXPORT_SYMBOL_GPL(blkdev_aio_read); - /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. @@ -1620,16 +1525,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; - struct block_device *bdev = I_BDEV(file->f_mapping->host); struct blk_plug plug; ssize_t ret; BUG_ON(iocb->ki_pos != pos); blk_start_plug(&plug); - - percpu_down_read(&bdev->bd_block_size_semaphore); - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); if (ret > 0 || ret == -EIOCBQUEUED) { ssize_t err; @@ -1638,62 +1539,27 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0 && ret > 0) ret = err; } - - percpu_up_read(&bdev->bd_block_size_semaphore); - blk_finish_plug(&plug); - return ret; } EXPORT_SYMBOL_GPL(blkdev_aio_write); -static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) -{ - int ret; - struct block_device *bdev = I_BDEV(file->f_mapping->host); - - percpu_down_read(&bdev->bd_block_size_semaphore); - - ret = generic_file_mmap(file, vma); - - percpu_up_read(&bdev->bd_block_size_semaphore); - - return ret; -} - -static ssize_t blkdev_splice_read(struct file *file, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - ssize_t ret; - struct block_device *bdev = I_BDEV(file->f_mapping->host); - - percpu_down_read(&bdev->bd_block_size_semaphore); - - ret = generic_file_splice_read(file, ppos, pipe, len, flags); - - percpu_up_read(&bdev->bd_block_size_semaphore); - - return ret; -} - -static ssize_t blkdev_splice_write(struct pipe_inode_info *pipe, - struct file *file, loff_t *ppos, size_t len, - unsigned int flags) +static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { - ssize_t ret; - struct block_device *bdev = I_BDEV(file->f_mapping->host); - - percpu_down_read(&bdev->bd_block_size_semaphore); - - ret = generic_file_splice_write(pipe, file, ppos, len, flags); + struct file *file = iocb->ki_filp; + struct inode *bd_inode = file->f_mapping->host; + loff_t size = i_size_read(bd_inode); - percpu_up_read(&bdev->bd_block_size_semaphore); + if (pos >= size) + return 0; - return ret; + size -= pos; + if (size < INT_MAX) + nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); + return generic_file_aio_read(iocb, iov, nr_segs, pos); } - /* * Try to release a page associated with block device when the system * is under memory pressure. @@ -1724,16 +1590,16 @@ const struct file_operations def_blk_fops = { .llseek = block_llseek, .read = do_sync_read, .write = do_sync_write, - .aio_read = blkdev_aio_read, + .aio_read = blkdev_aio_read, .aio_write = blkdev_aio_write, - .mmap = blkdev_mmap, + .mmap = generic_file_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif - .splice_read = blkdev_splice_read, - .splice_write = blkdev_splice_write, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, }; int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7cda51995c1e..22a0439e5a86 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3416,8 +3416,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) num_dirty = root->fs_info->dirty_metadata_bytes; if (num_dirty > thresh) { - balance_dirty_pages_ratelimited_nr( - root->fs_info->btree_inode->i_mapping, 1); + balance_dirty_pages_ratelimited( + root->fs_info->btree_inode->i_mapping); } return; } @@ -3437,8 +3437,8 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) num_dirty = root->fs_info->dirty_metadata_bytes; if (num_dirty > thresh) { - balance_dirty_pages_ratelimited_nr( - root->fs_info->btree_inode->i_mapping, 1); + balance_dirty_pages_ratelimited( + root->fs_info->btree_inode->i_mapping); } return; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9ab1bed88116..a8ee75cb96ee 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1346,8 +1346,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, cond_resched(); - balance_dirty_pages_ratelimited_nr(inode->i_mapping, - dirty_pages); + balance_dirty_pages_ratelimited(inode->i_mapping); if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) btrfs_btree_balance_dirty(root, 1); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8fcf9a59c28d..5b3429ab8ec1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1225,7 +1225,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } defrag_count += ret; - balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); + balance_dirty_pages_ratelimited(inode->i_mapping); mutex_unlock(&inode->i_mutex); if (newer_than) { diff --git a/fs/buffer.c b/fs/buffer.c index b5f044283edb..6e9ed48064fc 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -555,7 +555,7 @@ void emergency_thaw_all(void) */ int sync_mapping_buffers(struct address_space *mapping) { - struct address_space *buffer_mapping = mapping->assoc_mapping; + struct address_space *buffer_mapping = mapping->private_data; if (buffer_mapping == NULL || list_empty(&mapping->private_list)) return 0; @@ -588,10 +588,10 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) struct address_space *buffer_mapping = bh->b_page->mapping; mark_buffer_dirty(bh); - if (!mapping->assoc_mapping) { - mapping->assoc_mapping = buffer_mapping; + if (!mapping->private_data) { + mapping->private_data = buffer_mapping; } else { - BUG_ON(mapping->assoc_mapping != buffer_mapping); + BUG_ON(mapping->private_data != buffer_mapping); } if (!bh->b_assoc_map) { spin_lock(&buffer_mapping->private_lock); @@ -788,7 +788,7 @@ void invalidate_inode_buffers(struct inode *inode) if (inode_has_buffers(inode)) { struct address_space *mapping = &inode->i_data; struct list_head *list = &mapping->private_list; - struct address_space *buffer_mapping = mapping->assoc_mapping; + struct address_space *buffer_mapping = mapping->private_data; spin_lock(&buffer_mapping->private_lock); while (!list_empty(list)) @@ -811,7 +811,7 @@ int remove_inode_buffers(struct inode *inode) if (inode_has_buffers(inode)) { struct address_space *mapping = &inode->i_data; struct list_head *list = &mapping->private_list; - struct address_space *buffer_mapping = mapping->assoc_mapping; + struct address_space *buffer_mapping = mapping->private_data; spin_lock(&buffer_mapping->private_lock); while (!list_empty(list)) { @@ -911,6 +911,18 @@ link_dev_buffers(struct page *page, struct buffer_head *head) attach_page_buffers(page, head); } +static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) +{ + sector_t retval = ~((sector_t)0); + loff_t sz = i_size_read(bdev->bd_inode); + + if (sz) { + unsigned int sizebits = blksize_bits(size); + retval = (sz >> sizebits); + } + return retval; +} + /* * Initialise the state of a blockdev page's buffers. */ @@ -921,7 +933,7 @@ init_page_buffers(struct page *page, struct block_device *bdev, struct buffer_head *head = page_buffers(page); struct buffer_head *bh = head; int uptodate = PageUptodate(page); - sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode)); + sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size); do { if (!buffer_mapped(bh)) { @@ -1553,6 +1565,28 @@ void unmap_underlying_metadata(struct block_device *bdev, sector_t block) EXPORT_SYMBOL(unmap_underlying_metadata); /* + * Size is a power-of-two in the range 512..PAGE_SIZE, + * and the case we care about most is PAGE_SIZE. + * + * So this *could* possibly be written with those + * constraints in mind (relevant mostly if some + * architecture has a slow bit-scan instruction) + */ +static inline int block_size_bits(unsigned int blocksize) +{ + return ilog2(blocksize); +} + +static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state) +{ + BUG_ON(!PageLocked(page)); + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state); + return page_buffers(page); +} + +/* * NOTE! All mapped/uptodate combinations are valid: * * Mapped Uptodate Meaning @@ -1589,19 +1623,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page, sector_t block; sector_t last_block; struct buffer_head *bh, *head; - const unsigned blocksize = 1 << inode->i_blkbits; + unsigned int blocksize, bbits; int nr_underway = 0; int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); - BUG_ON(!PageLocked(page)); - - last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; - - if (!page_has_buffers(page)) { - create_empty_buffers(page, blocksize, + head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); - } /* * Be very careful. We have no exclusion from __set_page_dirty_buffers @@ -1613,9 +1641,12 @@ static int __block_write_full_page(struct inode *inode, struct page *page, * handle that here by just cleaning them. */ - block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - head = page_buffers(page); bh = head; + blocksize = bh->b_size; + bbits = block_size_bits(blocksize); + + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + last_block = (i_size_read(inode) - 1) >> bbits; /* * Get all the dirty buffers mapped to disk addresses and @@ -1806,12 +1837,10 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, BUG_ON(to > PAGE_CACHE_SIZE); BUG_ON(from > to); - blocksize = 1 << inode->i_blkbits; - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - head = page_buffers(page); + head = create_page_buffers(page, inode, 0); + blocksize = head->b_size; + bbits = block_size_bits(blocksize); - bbits = inode->i_blkbits; block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); for(bh = head, block_start = 0; bh != head || !block_start; @@ -1881,11 +1910,11 @@ static int __block_commit_write(struct inode *inode, struct page *page, unsigned blocksize; struct buffer_head *bh, *head; - blocksize = 1 << inode->i_blkbits; + bh = head = page_buffers(page); + blocksize = bh->b_size; - for(bh = head = page_buffers(page), block_start = 0; - bh != head || !block_start; - block_start=block_end, bh = bh->b_this_page) { + block_start = 0; + do { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) @@ -1895,7 +1924,10 @@ static int __block_commit_write(struct inode *inode, struct page *page, mark_buffer_dirty(bh); } clear_buffer_new(bh); - } + + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); /* * If this is a partial write which happened to make all buffers @@ -2020,7 +2052,6 @@ EXPORT_SYMBOL(generic_write_end); int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, unsigned long from) { - struct inode *inode = page->mapping->host; unsigned block_start, block_end, blocksize; unsigned to; struct buffer_head *bh, *head; @@ -2029,13 +2060,13 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, if (!page_has_buffers(page)) return 0; - blocksize = 1 << inode->i_blkbits; + head = page_buffers(page); + blocksize = head->b_size; to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); to = from + to; if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) return 0; - head = page_buffers(page); bh = head; block_start = 0; do { @@ -2068,18 +2099,16 @@ int block_read_full_page(struct page *page, get_block_t *get_block) struct inode *inode = page->mapping->host; sector_t iblock, lblock; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; - unsigned int blocksize; + unsigned int blocksize, bbits; int nr, i; int fully_mapped = 1; - BUG_ON(!PageLocked(page)); - blocksize = 1 << inode->i_blkbits; - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - head = page_buffers(page); + head = create_page_buffers(page, inode, 0); + blocksize = head->b_size; + bbits = block_size_bits(blocksize); - iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; + iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + lblock = (i_size_read(inode)+blocksize-1) >> bbits; bh = head; nr = 0; i = 0; @@ -2864,6 +2893,55 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) bio_put(bio); } +/* + * This allows us to do IO even on the odd last sectors + * of a device, even if the bh block size is some multiple + * of the physical sector size. + * + * We'll just truncate the bio to the size of the device, + * and clear the end of the buffer head manually. + * + * Truly out-of-range accesses will turn into actual IO + * errors, this only handles the "we need to be able to + * do IO at the final sector" case. + */ +static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) +{ + sector_t maxsector; + unsigned bytes; + + maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; + if (!maxsector) + return; + + /* + * If the *whole* IO is past the end of the device, + * let it through, and the IO layer will turn it into + * an EIO. + */ + if (unlikely(bio->bi_sector >= maxsector)) + return; + + maxsector -= bio->bi_sector; + bytes = bio->bi_size; + if (likely((bytes >> 9) <= maxsector)) + return; + + /* Uhhuh. We've got a bh that straddles the device size! */ + bytes = maxsector << 9; + + /* Truncate the bio.. */ + bio->bi_size = bytes; + bio->bi_io_vec[0].bv_len = bytes; + + /* ..and clear the end of the buffer for reads */ + if ((rw & RW_MASK) == READ) { + void *kaddr = kmap_atomic(bh->b_page); + memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes); + kunmap_atomic(kaddr); + } +} + int submit_bh(int rw, struct buffer_head * bh) { struct bio *bio; @@ -2900,6 +2978,9 @@ int submit_bh(int rw, struct buffer_head * bh) bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; + /* Take care of bh's that straddle the end of the device */ + guard_bh_eod(rw, bio, bh); + bio_get(bio); submit_bio(rw, bio); diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 2075ddfffa73..21ff76c22a17 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -122,9 +122,17 @@ config CIFS_ACL Allows fetching CIFS/NTFS ACL from the server. The DACL blob is handed over to the application/caller. +config CIFS_DEBUG + bool "Enable CIFS debugging routines" + default y + depends on CIFS + help + Enabling this option adds helpful debugging messages to + the cifs code which increases the size of the cifs module. + If unsure, say Y. config CIFS_DEBUG2 bool "Enable additional CIFS debugging routines" - depends on CIFS + depends on CIFS_DEBUG help Enabling this option adds a few more debugging routines to the cifs code which slightly increases the size of diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index c0c68bb492d7..86e92ef2abc1 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -18,7 +18,6 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ -#define CIFS_DEBUG /* BB temporary */ #ifndef _H_CIFS_DEBUG #define _H_CIFS_DEBUG @@ -37,49 +36,43 @@ void dump_smb(void *, int); #define CIFS_RC 0x02 #define CIFS_TIMER 0x04 +extern int cifsFYI; +extern int cifsERROR; + /* * debug ON * -------- */ -#ifdef CIFS_DEBUG +#ifdef CONFIG_CIFS_DEBUG /* information message: e.g., configuration, major event */ -extern int cifsFYI; -#define cifsfyi(fmt, arg...) \ +#define cifsfyi(fmt, ...) \ do { \ if (cifsFYI & CIFS_INFO) \ - printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg); \ + printk(KERN_DEBUG "%s: " fmt "\n", \ + __FILE__, ##__VA_ARGS__); \ } while (0) -#define cFYI(set, fmt, arg...) \ -do { \ - if (set) \ - cifsfyi(fmt, ##arg); \ +#define cFYI(set, fmt, ...) \ +do { \ + if (set) \ + cifsfyi(fmt, ##__VA_ARGS__); \ } while (0) -#define cifswarn(fmt, arg...) \ - printk(KERN_WARNING fmt "\n", ##arg) +#define cifswarn(fmt, ...) \ + printk(KERN_WARNING fmt "\n", ##__VA_ARGS__) -/* debug event message: */ -extern int cifsERROR; - -#define cEVENT(fmt, arg...) \ +/* error event message: e.g., i/o error */ +#define cifserror(fmt, ...) \ do { \ if (cifsERROR) \ - printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg); \ -} while (0) - -/* error event message: e.g., i/o error */ -#define cifserror(fmt, arg...) \ -do { \ - if (cifsERROR) \ - printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg); \ + printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \ } while (0) -#define cERROR(set, fmt, arg...) \ -do { \ - if (set) \ - cifserror(fmt, ##arg); \ +#define cERROR(set, fmt, ...) \ +do { \ + if (set) \ + cifserror(fmt, ##__VA_ARGS__); \ } while (0) /* @@ -87,10 +80,27 @@ do { \ * --------- */ #else /* _CIFS_DEBUG */ -#define cERROR(set, fmt, arg...) -#define cEVENT(fmt, arg...) -#define cFYI(set, fmt, arg...) -#define cifserror(fmt, arg...) +#define cifsfyi(fmt, ...) \ +do { \ + if (0) \ + printk(KERN_DEBUG "%s: " fmt "\n", \ + __FILE__, ##__VA_ARGS__); \ +} while (0) +#define cFYI(set, fmt, ...) \ +do { \ + if (0 && set) \ + cifsfyi(fmt, ##__VA_ARGS__); \ +} while (0) +#define cifserror(fmt, ...) \ +do { \ + if (0) \ + printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \ +} while (0) +#define cERROR(set, fmt, ...) \ +do { \ + if (0 && set) \ + cifserror(fmt, ##__VA_ARGS__); \ +} while (0) #endif /* _CIFS_DEBUG */ #endif /* _H_CIFS_DEBUG */ diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index fc783e264420..75c1ee699143 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -42,135 +42,27 @@ static const struct cifs_sid sid_authusers = { /* group users */ static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; -const struct cred *root_cred; - -static void -shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem, - int *nr_del) -{ - struct rb_node *node; - struct rb_node *tmp; - struct cifs_sid_id *psidid; - - node = rb_first(root); - while (node) { - tmp = node; - node = rb_next(tmp); - psidid = rb_entry(tmp, struct cifs_sid_id, rbnode); - if (nr_to_scan == 0 || *nr_del == nr_to_scan) - ++(*nr_rem); - else { - if (time_after(jiffies, psidid->time + SID_MAP_EXPIRE) - && psidid->refcount == 0) { - rb_erase(tmp, root); - ++(*nr_del); - } else - ++(*nr_rem); - } - } -} - -/* - * Run idmap cache shrinker. - */ -static int -cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc) -{ - int nr_to_scan = sc->nr_to_scan; - int nr_del = 0; - int nr_rem = 0; - struct rb_root *root; - - root = &uidtree; - spin_lock(&siduidlock); - shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); - spin_unlock(&siduidlock); - - root = &gidtree; - spin_lock(&sidgidlock); - shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); - spin_unlock(&sidgidlock); - - root = &siduidtree; - spin_lock(&uidsidlock); - shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); - spin_unlock(&uidsidlock); - - root = &sidgidtree; - spin_lock(&gidsidlock); - shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); - spin_unlock(&gidsidlock); - - return nr_rem; -} - -static void -sid_rb_insert(struct rb_root *root, unsigned long cid, - struct cifs_sid_id **psidid, char *typestr) -{ - char *strptr; - struct rb_node *node = root->rb_node; - struct rb_node *parent = NULL; - struct rb_node **linkto = &(root->rb_node); - struct cifs_sid_id *lsidid; - - while (node) { - lsidid = rb_entry(node, struct cifs_sid_id, rbnode); - parent = node; - if (cid > lsidid->id) { - linkto = &(node->rb_left); - node = node->rb_left; - } - if (cid < lsidid->id) { - linkto = &(node->rb_right); - node = node->rb_right; - } - } - - (*psidid)->id = cid; - (*psidid)->time = jiffies - (SID_MAP_RETRY + 1); - (*psidid)->refcount = 0; - - sprintf((*psidid)->sidstr, "%s", typestr); - strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); - sprintf(strptr, "%ld", cid); - - clear_bit(SID_ID_PENDING, &(*psidid)->state); - clear_bit(SID_ID_MAPPED, &(*psidid)->state); - - rb_link_node(&(*psidid)->rbnode, parent, linkto); - rb_insert_color(&(*psidid)->rbnode, root); -} - -static struct cifs_sid_id * -sid_rb_search(struct rb_root *root, unsigned long cid) -{ - struct rb_node *node = root->rb_node; - struct cifs_sid_id *lsidid; - - while (node) { - lsidid = rb_entry(node, struct cifs_sid_id, rbnode); - if (cid > lsidid->id) - node = node->rb_left; - else if (cid < lsidid->id) - node = node->rb_right; - else /* node found */ - return lsidid; - } - - return NULL; -} - -static struct shrinker cifs_shrinker = { - .shrink = cifs_idmap_shrinker, - .seeks = DEFAULT_SEEKS, -}; +static const struct cred *root_cred; static int cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { char *payload; + /* + * If the payload is less than or equal to the size of a pointer, then + * an allocation here is wasteful. Just copy the data directly to the + * payload.value union member instead. + * + * With this however, you must check the datalen before trying to + * dereference payload.data! + */ + if (prep->datalen <= sizeof(key->payload)) { + key->payload.value = 0; + memcpy(&key->payload.value, prep->data, prep->datalen); + key->datalen = prep->datalen; + return 0; + } payload = kmalloc(prep->datalen, GFP_KERNEL); if (!payload) return -ENOMEM; @@ -184,10 +76,11 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) static inline void cifs_idmap_key_destroy(struct key *key) { - kfree(key->payload.data); + if (key->datalen > sizeof(key->payload)) + kfree(key->payload.data); } -struct key_type cifs_idmap_key_type = { +static struct key_type cifs_idmap_key_type = { .name = "cifs.idmap", .instantiate = cifs_idmap_key_instantiate, .destroy = cifs_idmap_key_destroy, @@ -195,214 +88,174 @@ struct key_type cifs_idmap_key_type = { .match = user_match, }; -static void -sid_to_str(struct cifs_sid *sidptr, char *sidstr) +static char * +sid_to_key_str(struct cifs_sid *sidptr, unsigned int type) { - int i; - unsigned long saval; - char *strptr; + int i, len; + unsigned int saval; + char *sidstr, *strptr; + unsigned long long id_auth_val; + + /* 3 bytes for prefix */ + sidstr = kmalloc(3 + SID_STRING_BASE_SIZE + + (SID_STRING_SUBAUTH_SIZE * sidptr->num_subauth), + GFP_KERNEL); + if (!sidstr) + return sidstr; strptr = sidstr; + len = sprintf(strptr, "%cs:S-%hhu", type == SIDOWNER ? 'o' : 'g', + sidptr->revision); + strptr += len; + + /* The authority field is a single 48-bit number */ + id_auth_val = (unsigned long long)sidptr->authority[5]; + id_auth_val |= (unsigned long long)sidptr->authority[4] << 8; + id_auth_val |= (unsigned long long)sidptr->authority[3] << 16; + id_auth_val |= (unsigned long long)sidptr->authority[2] << 24; + id_auth_val |= (unsigned long long)sidptr->authority[1] << 32; + id_auth_val |= (unsigned long long)sidptr->authority[0] << 48; - sprintf(strptr, "%s", "S"); - strptr = sidstr + strlen(sidstr); - - sprintf(strptr, "-%d", sidptr->revision); - strptr = sidstr + strlen(sidstr); + /* + * MS-DTYP states that if the authority is >= 2^32, then it should be + * expressed as a hex value. + */ + if (id_auth_val <= UINT_MAX) + len = sprintf(strptr, "-%llu", id_auth_val); + else + len = sprintf(strptr, "-0x%llx", id_auth_val); - for (i = 0; i < 6; ++i) { - if (sidptr->authority[i]) { - sprintf(strptr, "-%d", sidptr->authority[i]); - strptr = sidstr + strlen(sidstr); - } - } + strptr += len; for (i = 0; i < sidptr->num_subauth; ++i) { saval = le32_to_cpu(sidptr->sub_auth[i]); - sprintf(strptr, "-%ld", saval); - strptr = sidstr + strlen(sidstr); + len = sprintf(strptr, "-%u", saval); + strptr += len; } + + return sidstr; } -static void -id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr, - struct cifs_sid_id **psidid, char *typestr) +/* + * if the two SIDs (roughly equivalent to a UUID for a user or group) are + * the same returns zero, if they do not match returns non-zero. + */ +static int +compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) { - int rc; - char *strptr; - struct rb_node *node = root->rb_node; - struct rb_node *parent = NULL; - struct rb_node **linkto = &(root->rb_node); - struct cifs_sid_id *lsidid; - - while (node) { - lsidid = rb_entry(node, struct cifs_sid_id, rbnode); - parent = node; - rc = compare_sids(sidptr, &((lsidid)->sid)); - if (rc > 0) { - linkto = &(node->rb_left); - node = node->rb_left; - } else if (rc < 0) { - linkto = &(node->rb_right); - node = node->rb_right; - } - } - - memcpy(&(*psidid)->sid, sidptr, sizeof(struct cifs_sid)); - (*psidid)->time = jiffies - (SID_MAP_RETRY + 1); - (*psidid)->refcount = 0; + int i; + int num_subauth, num_sat, num_saw; - sprintf((*psidid)->sidstr, "%s", typestr); - strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); - sid_to_str(&(*psidid)->sid, strptr); + if ((!ctsid) || (!cwsid)) + return 1; - clear_bit(SID_ID_PENDING, &(*psidid)->state); - clear_bit(SID_ID_MAPPED, &(*psidid)->state); + /* compare the revision */ + if (ctsid->revision != cwsid->revision) { + if (ctsid->revision > cwsid->revision) + return 1; + else + return -1; + } - rb_link_node(&(*psidid)->rbnode, parent, linkto); - rb_insert_color(&(*psidid)->rbnode, root); -} + /* compare all of the six auth values */ + for (i = 0; i < NUM_AUTHS; ++i) { + if (ctsid->authority[i] != cwsid->authority[i]) { + if (ctsid->authority[i] > cwsid->authority[i]) + return 1; + else + return -1; + } + } -static struct cifs_sid_id * -id_rb_search(struct rb_root *root, struct cifs_sid *sidptr) -{ - int rc; - struct rb_node *node = root->rb_node; - struct cifs_sid_id *lsidid; - - while (node) { - lsidid = rb_entry(node, struct cifs_sid_id, rbnode); - rc = compare_sids(sidptr, &((lsidid)->sid)); - if (rc > 0) { - node = node->rb_left; - } else if (rc < 0) { - node = node->rb_right; - } else /* node found */ - return lsidid; + /* compare all of the subauth values if any */ + num_sat = ctsid->num_subauth; + num_saw = cwsid->num_subauth; + num_subauth = num_sat < num_saw ? num_sat : num_saw; + if (num_subauth) { + for (i = 0; i < num_subauth; ++i) { + if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) { + if (le32_to_cpu(ctsid->sub_auth[i]) > + le32_to_cpu(cwsid->sub_auth[i])) + return 1; + else + return -1; + } + } } - return NULL; + return 0; /* sids compare/match */ } -static int -sidid_pending_wait(void *unused) +static void +cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) { - schedule(); - return signal_pending(current) ? -ERESTARTSYS : 0; + int i; + + dst->revision = src->revision; + dst->num_subauth = min_t(u8, src->num_subauth, SID_MAX_SUB_AUTHORITIES); + for (i = 0; i < NUM_AUTHS; ++i) + dst->authority[i] = src->authority[i]; + for (i = 0; i < dst->num_subauth; ++i) + dst->sub_auth[i] = src->sub_auth[i]; } static int -id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) +id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) { - int rc = 0; + int rc; struct key *sidkey; + struct cifs_sid *ksid; + unsigned int ksid_size; + char desc[3 + 10 + 1]; /* 3 byte prefix + 10 bytes for value + NULL */ const struct cred *saved_cred; - struct cifs_sid *lsid; - struct cifs_sid_id *psidid, *npsidid; - struct rb_root *cidtree; - spinlock_t *cidlock; - - if (sidtype == SIDOWNER) { - cidlock = &siduidlock; - cidtree = &uidtree; - } else if (sidtype == SIDGROUP) { - cidlock = &sidgidlock; - cidtree = &gidtree; - } else - return -EINVAL; - spin_lock(cidlock); - psidid = sid_rb_search(cidtree, cid); - - if (!psidid) { /* node does not exist, allocate one & attempt adding */ - spin_unlock(cidlock); - npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL); - if (!npsidid) - return -ENOMEM; - - npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL); - if (!npsidid->sidstr) { - kfree(npsidid); - return -ENOMEM; - } + rc = snprintf(desc, sizeof(desc), "%ci:%u", + sidtype == SIDOWNER ? 'o' : 'g', cid); + if (rc >= sizeof(desc)) + return -EINVAL; - spin_lock(cidlock); - psidid = sid_rb_search(cidtree, cid); - if (psidid) { /* node happened to get inserted meanwhile */ - ++psidid->refcount; - spin_unlock(cidlock); - kfree(npsidid->sidstr); - kfree(npsidid); - } else { - psidid = npsidid; - sid_rb_insert(cidtree, cid, &psidid, - sidtype == SIDOWNER ? "oi:" : "gi:"); - ++psidid->refcount; - spin_unlock(cidlock); - } - } else { - ++psidid->refcount; - spin_unlock(cidlock); + rc = 0; + saved_cred = override_creds(root_cred); + sidkey = request_key(&cifs_idmap_key_type, desc, ""); + if (IS_ERR(sidkey)) { + rc = -EINVAL; + cFYI(1, "%s: Can't map %cid %u to a SID", __func__, + sidtype == SIDOWNER ? 'u' : 'g', cid); + goto out_revert_creds; + } else if (sidkey->datalen < CIFS_SID_BASE_SIZE) { + rc = -EIO; + cFYI(1, "%s: Downcall contained malformed key " + "(datalen=%hu)", __func__, sidkey->datalen); + goto invalidate_key; } /* - * If we are here, it is safe to access psidid and its fields - * since a reference was taken earlier while holding the spinlock. - * A reference on the node is put without holding the spinlock - * and it is OK to do so in this case, shrinker will not erase - * this node until all references are put and we do not access - * any fields of the node after a reference is put . + * A sid is usually too large to be embedded in payload.value, but if + * there are no subauthorities and the host has 8-byte pointers, then + * it could be. */ - if (test_bit(SID_ID_MAPPED, &psidid->state)) { - memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); - psidid->time = jiffies; /* update ts for accessing */ - goto id_sid_out; - } - - if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) { - rc = -EINVAL; - goto id_sid_out; + ksid = sidkey->datalen <= sizeof(sidkey->payload) ? + (struct cifs_sid *)&sidkey->payload.value : + (struct cifs_sid *)sidkey->payload.data; + + ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32)); + if (ksid_size > sidkey->datalen) { + rc = -EIO; + cFYI(1, "%s: Downcall contained malformed key (datalen=%hu, " + "ksid_size=%u)", __func__, sidkey->datalen, ksid_size); + goto invalidate_key; } - if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { - saved_cred = override_creds(root_cred); - sidkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); - if (IS_ERR(sidkey)) { - rc = -EINVAL; - cFYI(1, "%s: Can't map and id to a SID", __func__); - } else { - lsid = (struct cifs_sid *)sidkey->payload.data; - memcpy(&psidid->sid, lsid, - sidkey->datalen < sizeof(struct cifs_sid) ? - sidkey->datalen : sizeof(struct cifs_sid)); - memcpy(ssid, &psidid->sid, - sidkey->datalen < sizeof(struct cifs_sid) ? - sidkey->datalen : sizeof(struct cifs_sid)); - set_bit(SID_ID_MAPPED, &psidid->state); - key_put(sidkey); - kfree(psidid->sidstr); - } - psidid->time = jiffies; /* update ts for accessing */ - revert_creds(saved_cred); - clear_bit(SID_ID_PENDING, &psidid->state); - wake_up_bit(&psidid->state, SID_ID_PENDING); - } else { - rc = wait_on_bit(&psidid->state, SID_ID_PENDING, - sidid_pending_wait, TASK_INTERRUPTIBLE); - if (rc) { - cFYI(1, "%s: sidid_pending_wait interrupted %d", - __func__, rc); - --psidid->refcount; - return rc; - } - if (test_bit(SID_ID_MAPPED, &psidid->state)) - memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); - else - rc = -EINVAL; - } -id_sid_out: - --psidid->refcount; + cifs_copy_sid(ssid, ksid); +out_key_put: + key_put(sidkey); +out_revert_creds: + revert_creds(saved_cred); return rc; + +invalidate_key: + key_invalidate(sidkey); + goto out_key_put; } static int @@ -410,111 +263,67 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, struct cifs_fattr *fattr, uint sidtype) { int rc; - unsigned long cid; - struct key *idkey; + struct key *sidkey; + char *sidstr; const struct cred *saved_cred; - struct cifs_sid_id *psidid, *npsidid; - struct rb_root *cidtree; - spinlock_t *cidlock; - - if (sidtype == SIDOWNER) { - cid = cifs_sb->mnt_uid; /* default uid, in case upcall fails */ - cidlock = &siduidlock; - cidtree = &uidtree; - } else if (sidtype == SIDGROUP) { - cid = cifs_sb->mnt_gid; /* default gid, in case upcall fails */ - cidlock = &sidgidlock; - cidtree = &gidtree; - } else - return -ENOENT; - - spin_lock(cidlock); - psidid = id_rb_search(cidtree, psid); - - if (!psidid) { /* node does not exist, allocate one & attempt adding */ - spin_unlock(cidlock); - npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL); - if (!npsidid) - return -ENOMEM; - - npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL); - if (!npsidid->sidstr) { - kfree(npsidid); - return -ENOMEM; - } - - spin_lock(cidlock); - psidid = id_rb_search(cidtree, psid); - if (psidid) { /* node happened to get inserted meanwhile */ - ++psidid->refcount; - spin_unlock(cidlock); - kfree(npsidid->sidstr); - kfree(npsidid); - } else { - psidid = npsidid; - id_rb_insert(cidtree, psid, &psidid, - sidtype == SIDOWNER ? "os:" : "gs:"); - ++psidid->refcount; - spin_unlock(cidlock); - } - } else { - ++psidid->refcount; - spin_unlock(cidlock); - } + uid_t fuid = cifs_sb->mnt_uid; + gid_t fgid = cifs_sb->mnt_gid; /* - * If we are here, it is safe to access psidid and its fields - * since a reference was taken earlier while holding the spinlock. - * A reference on the node is put without holding the spinlock - * and it is OK to do so in this case, shrinker will not erase - * this node until all references are put and we do not access - * any fields of the node after a reference is put . + * If we have too many subauthorities, then something is really wrong. + * Just return an error. */ - if (test_bit(SID_ID_MAPPED, &psidid->state)) { - cid = psidid->id; - psidid->time = jiffies; /* update ts for accessing */ - goto sid_to_id_out; + if (unlikely(psid->num_subauth > SID_MAX_SUB_AUTHORITIES)) { + cFYI(1, "%s: %u subauthorities is too many!", __func__, + psid->num_subauth); + return -EIO; } - if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) - goto sid_to_id_out; - - if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { - saved_cred = override_creds(root_cred); - idkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); - if (IS_ERR(idkey)) - cFYI(1, "%s: Can't map SID to an id", __func__); - else { - cid = *(unsigned long *)idkey->payload.value; - psidid->id = cid; - set_bit(SID_ID_MAPPED, &psidid->state); - key_put(idkey); - kfree(psidid->sidstr); - } - revert_creds(saved_cred); - psidid->time = jiffies; /* update ts for accessing */ - clear_bit(SID_ID_PENDING, &psidid->state); - wake_up_bit(&psidid->state, SID_ID_PENDING); - } else { - rc = wait_on_bit(&psidid->state, SID_ID_PENDING, - sidid_pending_wait, TASK_INTERRUPTIBLE); - if (rc) { - cFYI(1, "%s: sidid_pending_wait interrupted %d", - __func__, rc); - --psidid->refcount; /* decremented without spinlock */ - return rc; - } - if (test_bit(SID_ID_MAPPED, &psidid->state)) - cid = psidid->id; + sidstr = sid_to_key_str(psid, sidtype); + if (!sidstr) + return -ENOMEM; + + saved_cred = override_creds(root_cred); + sidkey = request_key(&cifs_idmap_key_type, sidstr, ""); + if (IS_ERR(sidkey)) { + rc = -EINVAL; + cFYI(1, "%s: Can't map SID %s to a %cid", __func__, sidstr, + sidtype == SIDOWNER ? 'u' : 'g'); + goto out_revert_creds; + } + + /* + * FIXME: Here we assume that uid_t and gid_t are same size. It's + * probably a safe assumption but might be better to check based on + * sidtype. + */ + if (sidkey->datalen != sizeof(uid_t)) { + rc = -EIO; + cFYI(1, "%s: Downcall contained malformed key " + "(datalen=%hu)", __func__, sidkey->datalen); + key_invalidate(sidkey); + goto out_key_put; } -sid_to_id_out: - --psidid->refcount; /* decremented without spinlock */ if (sidtype == SIDOWNER) - fattr->cf_uid = cid; + memcpy(&fuid, &sidkey->payload.value, sizeof(uid_t)); else - fattr->cf_gid = cid; + memcpy(&fgid, &sidkey->payload.value, sizeof(gid_t)); + +out_key_put: + key_put(sidkey); +out_revert_creds: + revert_creds(saved_cred); + kfree(sidstr); + /* + * Note that we return 0 here unconditionally. If the mapping + * fails then we just fall back to using the mnt_uid/mnt_gid. + */ + if (sidtype == SIDOWNER) + fattr->cf_uid = fuid; + else + fattr->cf_gid = fgid; return 0; } @@ -561,17 +370,6 @@ init_cifs_idmap(void) cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; root_cred = cred; - spin_lock_init(&siduidlock); - uidtree = RB_ROOT; - spin_lock_init(&sidgidlock); - gidtree = RB_ROOT; - - spin_lock_init(&uidsidlock); - siduidtree = RB_ROOT; - spin_lock_init(&gidsidlock); - sidgidtree = RB_ROOT; - register_shrinker(&cifs_shrinker); - cFYI(1, "cifs idmap keyring: %d", key_serial(keyring)); return 0; @@ -588,95 +386,13 @@ exit_cifs_idmap(void) key_revoke(root_cred->thread_keyring); unregister_key_type(&cifs_idmap_key_type); put_cred(root_cred); - unregister_shrinker(&cifs_shrinker); cFYI(1, "Unregistered %s key type", cifs_idmap_key_type.name); } -void -cifs_destroy_idmaptrees(void) -{ - struct rb_root *root; - struct rb_node *node; - - root = &uidtree; - spin_lock(&siduidlock); - while ((node = rb_first(root))) - rb_erase(node, root); - spin_unlock(&siduidlock); - - root = &gidtree; - spin_lock(&sidgidlock); - while ((node = rb_first(root))) - rb_erase(node, root); - spin_unlock(&sidgidlock); - - root = &siduidtree; - spin_lock(&uidsidlock); - while ((node = rb_first(root))) - rb_erase(node, root); - spin_unlock(&uidsidlock); - - root = &sidgidtree; - spin_lock(&gidsidlock); - while ((node = rb_first(root))) - rb_erase(node, root); - spin_unlock(&gidsidlock); -} - -/* if the two SIDs (roughly equivalent to a UUID for a user or group) are - the same returns 1, if they do not match returns 0 */ -int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) -{ - int i; - int num_subauth, num_sat, num_saw; - - if ((!ctsid) || (!cwsid)) - return 1; - - /* compare the revision */ - if (ctsid->revision != cwsid->revision) { - if (ctsid->revision > cwsid->revision) - return 1; - else - return -1; - } - - /* compare all of the six auth values */ - for (i = 0; i < 6; ++i) { - if (ctsid->authority[i] != cwsid->authority[i]) { - if (ctsid->authority[i] > cwsid->authority[i]) - return 1; - else - return -1; - } - } - - /* compare all of the subauth values if any */ - num_sat = ctsid->num_subauth; - num_saw = cwsid->num_subauth; - num_subauth = num_sat < num_saw ? num_sat : num_saw; - if (num_subauth) { - for (i = 0; i < num_subauth; ++i) { - if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) { - if (le32_to_cpu(ctsid->sub_auth[i]) > - le32_to_cpu(cwsid->sub_auth[i])) - return 1; - else - return -1; - } - } - } - - return 0; /* sids compare/match */ -} - - /* copy ntsd, owner sid, and group sid from a security descriptor to another */ static void copy_sec_desc(const struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, __u32 sidsoffset) { - int i; - struct cifs_sid *owner_sid_ptr, *group_sid_ptr; struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; @@ -692,26 +408,14 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd, owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset); - - nowner_sid_ptr->revision = owner_sid_ptr->revision; - nowner_sid_ptr->num_subauth = owner_sid_ptr->num_subauth; - for (i = 0; i < 6; i++) - nowner_sid_ptr->authority[i] = owner_sid_ptr->authority[i]; - for (i = 0; i < 5; i++) - nowner_sid_ptr->sub_auth[i] = owner_sid_ptr->sub_auth[i]; + cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr); /* copy group sid */ group_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset + sizeof(struct cifs_sid)); - - ngroup_sid_ptr->revision = group_sid_ptr->revision; - ngroup_sid_ptr->num_subauth = group_sid_ptr->num_subauth; - for (i = 0; i < 6; i++) - ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i]; - for (i = 0; i < 5; i++) - ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i]; + cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr); return; } @@ -818,7 +522,7 @@ static __u16 fill_ace_for_sid(struct cifs_ace *pntace, pntace->sid.revision = psid->revision; pntace->sid.num_subauth = psid->num_subauth; - for (i = 0; i < 6; i++) + for (i = 0; i < NUM_AUTHS; i++) pntace->sid.authority[i] = psid->authority[i]; for (i = 0; i < psid->num_subauth; i++) pntace->sid.sub_auth[i] = psid->sub_auth[i]; @@ -994,8 +698,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl) return -EINVAL; } - if (psid->num_subauth) { #ifdef CONFIG_CIFS_DEBUG2 + if (psid->num_subauth) { int i; cFYI(1, "SID revision %d num_auth %d", psid->revision, psid->num_subauth); @@ -1009,8 +713,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl) num auths and therefore go off the end */ cFYI(1, "RID 0x%x", le32_to_cpu(psid->sub_auth[psid->num_subauth-1])); -#endif } +#endif return 0; } @@ -1120,8 +824,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, kfree(nowner_sid_ptr); return rc; } - memcpy(owner_sid_ptr, nowner_sid_ptr, - sizeof(struct cifs_sid)); + cifs_copy_sid(owner_sid_ptr, nowner_sid_ptr); kfree(nowner_sid_ptr); *aclflag = CIFS_ACL_OWNER; } @@ -1139,8 +842,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, kfree(ngroup_sid_ptr); return rc; } - memcpy(group_sid_ptr, ngroup_sid_ptr, - sizeof(struct cifs_sid)); + cifs_copy_sid(group_sid_ptr, ngroup_sid_ptr); kfree(ngroup_sid_ptr); *aclflag = CIFS_ACL_GROUP; } @@ -1316,42 +1018,39 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, /* Get the security descriptor */ pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); - - /* Add three ACEs for owner, group, everyone getting rid of - other ACEs as chmod disables ACEs and set the security descriptor */ - if (IS_ERR(pntsd)) { rc = PTR_ERR(pntsd); cERROR(1, "%s: error %d getting sec desc", __func__, rc); - } else { - /* allocate memory for the smb header, - set security descriptor request security descriptor - parameters, and secuirty descriptor itself */ - - secdesclen = secdesclen < DEFSECDESCLEN ? - DEFSECDESCLEN : secdesclen; - pnntsd = kmalloc(secdesclen, GFP_KERNEL); - if (!pnntsd) { - cERROR(1, "Unable to allocate security descriptor"); - kfree(pntsd); - return -ENOMEM; - } + goto out; + } - rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, - &aclflag); + /* + * Add three ACEs for owner, group, everyone getting rid of other ACEs + * as chmod disables ACEs and set the security descriptor. Allocate + * memory for the smb header, set security descriptor request security + * descriptor parameters, and secuirty descriptor itself + */ + secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN); + pnntsd = kmalloc(secdesclen, GFP_KERNEL); + if (!pnntsd) { + cERROR(1, "Unable to allocate security descriptor"); + kfree(pntsd); + return -ENOMEM; + } - cFYI(DBG2, "build_sec_desc rc: %d", rc); + rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, + &aclflag); - if (!rc) { - /* Set the security descriptor */ - rc = set_cifs_acl(pnntsd, secdesclen, inode, - path, aclflag); - cFYI(DBG2, "set_cifs_acl rc: %d", rc); - } + cFYI(DBG2, "build_sec_desc rc: %d", rc); - kfree(pnntsd); - kfree(pntsd); + if (!rc) { + /* Set the security descriptor */ + rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag); + cFYI(DBG2, "set_cifs_acl rc: %d", rc); } + kfree(pnntsd); + kfree(pntsd); +out: return rc; } diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index 5c902c7ce524..4f3884835267 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -23,11 +23,8 @@ #define _CIFSACL_H -#define NUM_AUTHS 6 /* number of authority fields */ -#define NUM_SUBAUTHS 5 /* number of sub authority fields */ -#define NUM_WK_SIDS 7 /* number of well known sids */ -#define SIDNAMELENGTH 20 /* long enough for the ones we care about */ -#define DEFSECDESCLEN 192 /* sec desc len contaiting a dacl with three aces */ +#define NUM_AUTHS (6) /* number of authority fields */ +#define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */ #define READ_BIT 0x4 #define WRITE_BIT 0x2 @@ -41,12 +38,32 @@ #define SIDOWNER 1 #define SIDGROUP 2 -#define SIDLEN 150 /* S- 1 revision- 6 authorities- max 5 sub authorities */ -#define SID_ID_MAPPED 0 -#define SID_ID_PENDING 1 -#define SID_MAP_EXPIRE (3600 * HZ) /* map entry expires after one hour */ -#define SID_MAP_RETRY (300 * HZ) /* wait 5 minutes for next attempt to map */ +/* + * Security Descriptor length containing DACL with 3 ACEs (one each for + * owner, group and world). + */ +#define DEFAULT_SEC_DESC_LEN (sizeof(struct cifs_ntsd) + \ + sizeof(struct cifs_acl) + \ + (sizeof(struct cifs_ace) * 3)) + +/* + * Maximum size of a string representation of a SID: + * + * The fields are unsigned values in decimal. So: + * + * u8: max 3 bytes in decimal + * u32: max 10 bytes in decimal + * + * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator + * + * For authority field, max is when all 6 values are non-zero and it must be + * represented in hex. So "-0x" + 12 hex digits. + * + * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-') + */ +#define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1) +#define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */ struct cifs_ntsd { __le16 revision; /* revision level */ @@ -60,10 +77,13 @@ struct cifs_ntsd { struct cifs_sid { __u8 revision; /* revision level */ __u8 num_subauth; - __u8 authority[6]; - __le32 sub_auth[5]; /* sub_auth[num_subauth] */ + __u8 authority[NUM_AUTHS]; + __le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */ } __attribute__((packed)); +/* size of a struct cifs_sid, sans sub_auth array */ +#define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS) + struct cifs_acl { __le16 revision; /* revision level */ __le16 size; @@ -78,26 +98,4 @@ struct cifs_ace { struct cifs_sid sid; /* ie UUID of user or group who gets these perms */ } __attribute__((packed)); -struct cifs_wksid { - struct cifs_sid cifssid; - char sidname[SIDNAMELENGTH]; -} __attribute__((packed)); - -struct cifs_sid_id { - unsigned int refcount; /* increment with spinlock, decrement without */ - unsigned long id; - unsigned long time; - unsigned long state; - char *sidstr; - struct rb_node rbnode; - struct cifs_sid sid; -}; - -#ifdef __KERNEL__ -extern struct key_type cifs_idmap_key_type; -extern const struct cred *root_cred; -#endif /* KERNEL */ - -extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *); - #endif /* _CIFSACL_H */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index e7931cc55d0c..210f0af83fc4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -64,24 +64,23 @@ unsigned int global_secflags = CIFSSEC_DEF; unsigned int sign_CIFS_PDUs = 1; static const struct super_operations cifs_super_ops; unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; -module_param(CIFSMaxBufSize, int, 0); +module_param(CIFSMaxBufSize, uint, 0); MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " "Default: 16384 Range: 8192 to 130048"); unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; -module_param(cifs_min_rcv, int, 0); +module_param(cifs_min_rcv, uint, 0); MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: " "1 to 64"); unsigned int cifs_min_small = 30; -module_param(cifs_min_small, int, 0); +module_param(cifs_min_small, uint, 0); MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " "Range: 2 to 256"); unsigned int cifs_max_pending = CIFS_MAX_REQ; -module_param(cifs_max_pending, int, 0444); +module_param(cifs_max_pending, uint, 0444); MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " "Default: 32767 Range: 2 to 32767."); module_param(enable_oplocks, bool, 0644); -MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:" - "y/Y/1"); +MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1"); extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; @@ -230,6 +229,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_set_oplock_level(cifs_inode, 0); cifs_inode->delete_pending = false; cifs_inode->invalid_mapping = false; + cifs_inode->leave_pages_clean = false; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; @@ -540,8 +540,8 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) char *s, *p; char sep; - full_path = build_path_to_root(vol, cifs_sb, - cifs_sb_master_tcon(cifs_sb)); + full_path = cifs_build_path_to_root(vol, cifs_sb, + cifs_sb_master_tcon(cifs_sb)); if (full_path == NULL) return ERR_PTR(-ENOMEM); @@ -1205,7 +1205,6 @@ exit_cifs(void) unregister_filesystem(&cifs_fs_type); cifs_dfs_release_automount_timer(); #ifdef CONFIG_CIFS_ACL - cifs_destroy_idmaptrees(); exit_cifs_idmap(); #endif #ifdef CONFIG_CIFS_UPCALL diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index f5af2527fc69..aea1eec64911 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -178,6 +178,7 @@ struct smb_rqst { enum smb_version { Smb_1 = 1, + Smb_20, Smb_21, Smb_30, }; @@ -280,9 +281,6 @@ struct smb_version_operations { /* set attributes */ int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *, const unsigned int); - /* build a full path to the root of the mount */ - char * (*build_path_to_root)(struct smb_vol *, struct cifs_sb_info *, - struct cifs_tcon *); /* check if we can send an echo or nor */ bool (*can_echo)(struct TCP_Server_Info *); /* send echo request */ @@ -369,6 +367,8 @@ struct smb_version_operations { void (*set_lease_key)(struct inode *, struct cifs_fid *fid); /* generate new lease key */ void (*new_lease_key)(struct cifs_fid *fid); + int (*calc_signature)(struct smb_rqst *rqst, + struct TCP_Server_Info *server); }; struct smb_version_values { @@ -396,7 +396,6 @@ struct smb_vol { char *password; char *domainname; char *UNC; - char *UNCip; char *iocharset; /* local code page for mapping to and from Unicode */ char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ @@ -444,11 +443,11 @@ struct smb_vol { unsigned int rsize; unsigned int wsize; bool sockopt_tcp_nodelay:1; - unsigned short int port; unsigned long actimeo; /* attribute cache timeout (jiffies) */ struct smb_version_operations *ops; struct smb_version_values *vals; char *prepath; + struct sockaddr_storage dstaddr; /* destination address */ struct sockaddr_storage srcaddr; /* allow binding to a local IP */ struct nls_table *local_nls; }; @@ -1031,6 +1030,7 @@ struct cifsInodeInfo { bool clientCanCacheAll; /* read and writebehind oplock */ bool delete_pending; /* DELETE_ON_CLOSE is set */ bool invalid_mapping; /* pagecache is invalid */ + bool leave_pages_clean; /* protected by i_mutex, not set pages dirty */ unsigned long time; /* jiffies of last update of inode */ u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ @@ -1067,30 +1067,16 @@ static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb) static inline void convert_delimiter(char *path, char delim) { - int i; - char old_delim; - - if (path == NULL) - return; + char old_delim, *pos; if (delim == '/') old_delim = '\\'; else old_delim = '/'; - for (i = 0; path[i] != '\0'; i++) { - if (path[i] == old_delim) - path[i] = delim; - } -} - -static inline char * -build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, - struct cifs_tcon *tcon) -{ - if (!vol->ops->build_path_to_root) - return NULL; - return vol->ops->build_path_to_root(vol, cifs_sb, tcon); + pos = path; + while ((pos = strchr(pos, old_delim))) + *pos = delim; } #ifdef CONFIG_CIFS_STATS @@ -1362,7 +1348,7 @@ require use of the stronger protocol */ #define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ #define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ -#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP) +#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMSSP) #define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) #define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) /* @@ -1506,6 +1492,6 @@ extern struct smb_version_values smb20_values; extern struct smb_version_operations smb21_operations; extern struct smb_version_values smb21_values; #define SMB30_VERSION_STRING "3.0" -/*extern struct smb_version_operations smb30_operations; */ /* not needed yet */ +extern struct smb_version_operations smb30_operations; extern struct smb_version_values smb30_values; #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 5144e9fbeb8c..1988c1baa224 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -58,8 +58,10 @@ do { \ } while (0) extern int init_cifs_idmap(void); extern void exit_cifs_idmap(void); -extern void cifs_destroy_idmaptrees(void); extern char *build_path_from_dentry(struct dentry *); +extern char *cifs_build_path_to_root(struct smb_vol *vol, + struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon); extern char *build_wildcard_path_from_dentry(struct dentry *direntry); extern char *cifs_compose_mount_options(const char *sb_mountdata, const char *fullpath, const struct dfs_info3_param *ref, @@ -107,9 +109,7 @@ extern unsigned int smbCalcSize(void *buf); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); -extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port); -extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, - const unsigned short int port); +extern void cifs_set_port(struct sockaddr *addr, const unsigned short int port); extern int map_smb_to_linux_error(char *buf, bool logErr); extern void header_assemble(struct smb_hdr *, char /* command */ , const struct cifs_tcon *, int /* length of @@ -185,7 +185,7 @@ extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon); extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, __u8 type, struct cifsLockInfo **conf_lock, - bool rw_check); + int rw_check); extern void cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink, struct cifs_pending_open *open); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5c670b998ffb..7635b5db26a7 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -186,6 +186,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_user, "user=%s" }, { Opt_user, "username=%s" }, { Opt_blank_pass, "pass=" }, + { Opt_blank_pass, "password=" }, { Opt_pass, "pass=%s" }, { Opt_pass, "password=%s" }, { Opt_blank_ip, "ip=" }, @@ -274,6 +275,7 @@ static const match_table_t cifs_cacheflavor_tokens = { static const match_table_t cifs_smb_version_tokens = { { Smb_1, SMB1_VERSION_STRING }, + { Smb_20, SMB20_VERSION_STRING}, { Smb_21, SMB21_VERSION_STRING }, { Smb_30, SMB30_VERSION_STRING }, }; @@ -1074,12 +1076,16 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) vol->vals = &smb1_values; break; #ifdef CONFIG_CIFS_SMB2 + case Smb_20: + vol->ops = &smb21_operations; /* currently identical with 2.1 */ + vol->vals = &smb20_values; + break; case Smb_21: vol->ops = &smb21_operations; vol->vals = &smb21_values; break; case Smb_30: - vol->ops = &smb21_operations; /* currently identical with 2.1 */ + vol->ops = &smb30_operations; vol->vals = &smb30_values; break; #endif @@ -1090,6 +1096,52 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) return 0; } +/* + * Parse a devname into substrings and populate the vol->UNC and vol->prepath + * fields with the result. Returns 0 on success and an error otherwise. + */ +static int +cifs_parse_devname(const char *devname, struct smb_vol *vol) +{ + char *pos; + const char *delims = "/\\"; + size_t len; + + /* make sure we have a valid UNC double delimiter prefix */ + len = strspn(devname, delims); + if (len != 2) + return -EINVAL; + + /* find delimiter between host and sharename */ + pos = strpbrk(devname + 2, delims); + if (!pos) + return -EINVAL; + + /* skip past delimiter */ + ++pos; + + /* now go until next delimiter or end of string */ + len = strcspn(pos, delims); + + /* move "pos" up to delimiter or NULL */ + pos += len; + vol->UNC = kstrndup(devname, pos - devname, GFP_KERNEL); + if (!vol->UNC) + return -ENOMEM; + + convert_delimiter(vol->UNC, '\\'); + + /* If pos is NULL, or is a bogus trailing delimiter then no prepath */ + if (!*pos++ || !*pos) + return 0; + + vol->prepath = kstrdup(pos, GFP_KERNEL); + if (!vol->prepath) + return -ENOMEM; + + return 0; +} + static int cifs_parse_mount_options(const char *mountdata, const char *devname, struct smb_vol *vol) @@ -1108,11 +1160,17 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, char *string = NULL; char *tmp_end, *value; char delim; + bool got_ip = false; + unsigned short port = 0; + struct sockaddr *dstaddr = (struct sockaddr *)&vol->dstaddr; separator[0] = ','; separator[1] = 0; delim = separator[0]; + /* ensure we always start with zeroed-out smb_vol */ + memset(vol, 0, sizeof(*vol)); + /* * does not have to be perfect mapping since field is * informational, only used for servers that do not support @@ -1169,6 +1227,16 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->backupuid_specified = false; /* no backup intent for a user */ vol->backupgid_specified = false; /* no backup intent for a group */ + /* + * For now, we ignore -EINVAL errors under the assumption that the + * unc= and prefixpath= options will be usable. + */ + if (cifs_parse_devname(devname, vol) == -ENOMEM) { + printk(KERN_ERR "CIFS: Unable to allocate memory to parse " + "device string.\n"); + goto out_nomem; + } + while ((data = strsep(&options, separator)) != NULL) { substring_t args[MAX_OPT_ARGS]; unsigned long option; @@ -1416,12 +1484,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->dir_mode = option; break; case Opt_port: - if (get_option_ul(args, &option)) { - cERROR(1, "%s: Invalid port value", - __func__); + if (get_option_ul(args, &option) || + option > USHRT_MAX) { + cERROR(1, "%s: Invalid port value", __func__); goto cifs_parse_mount_err; } - vol->port = option; + port = (unsigned short)option; break; case Opt_rsize: if (get_option_ul(args, &option)) { @@ -1537,53 +1605,48 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->password[j] = '\0'; break; case Opt_blank_ip: - vol->UNCip = NULL; + /* FIXME: should this be an error instead? */ + got_ip = false; break; case Opt_ip: string = match_strdup(args); if (string == NULL) goto out_nomem; - if (strnlen(string, INET6_ADDRSTRLEN) > - INET6_ADDRSTRLEN) { - printk(KERN_WARNING "CIFS: ip address " - "too long\n"); - goto cifs_parse_mount_err; - } - vol->UNCip = kstrdup(string, GFP_KERNEL); - if (!vol->UNCip) { - printk(KERN_WARNING "CIFS: no memory " - "for UNC IP\n"); + if (!cifs_convert_address(dstaddr, string, + strlen(string))) { + printk(KERN_ERR "CIFS: bad ip= option (%s).\n", + string); goto cifs_parse_mount_err; } + got_ip = true; break; case Opt_unc: - string = match_strdup(args); - if (string == NULL) + string = vol->UNC; + vol->UNC = match_strdup(args); + if (vol->UNC == NULL) { + kfree(string); goto out_nomem; - - temp_len = strnlen(string, 300); - if (temp_len == 300) { - printk(KERN_WARNING "CIFS: UNC name too long\n"); - goto cifs_parse_mount_err; } - vol->UNC = kmalloc(temp_len+1, GFP_KERNEL); - if (vol->UNC == NULL) { - printk(KERN_WARNING "CIFS: no memory for UNC\n"); - goto cifs_parse_mount_err; - } - strcpy(vol->UNC, string); - - if (strncmp(string, "//", 2) == 0) { - vol->UNC[0] = '\\'; - vol->UNC[1] = '\\'; - } else if (strncmp(string, "\\\\", 2) != 0) { - printk(KERN_WARNING "CIFS: UNC Path does not " - "begin with // or \\\\\n"); + convert_delimiter(vol->UNC, '\\'); + if (vol->UNC[0] != '\\' || vol->UNC[1] != '\\') { + kfree(string); + printk(KERN_ERR "CIFS: UNC Path does not " + "begin with // or \\\\\n"); goto cifs_parse_mount_err; } + /* Compare old unc= option to new one */ + if (!string || strcmp(string, vol->UNC)) + printk(KERN_WARNING "CIFS: the value of the " + "unc= mount option does not match the " + "device string. Using the unc= option " + "for now. In 3.10, that option will " + "be ignored and the contents of the " + "device string will be used " + "instead. (%s != %s)\n", string, + vol->UNC); break; case Opt_domain: string = match_strdup(args); @@ -1618,31 +1681,26 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } break; case Opt_prefixpath: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - - temp_len = strnlen(string, 1024); - if (string[0] != '/') - temp_len++; /* missing leading slash */ - if (temp_len > 1024) { - printk(KERN_WARNING "CIFS: prefix too long\n"); - goto cifs_parse_mount_err; - } + /* skip over any leading delimiter */ + if (*args[0].from == '/' || *args[0].from == '\\') + args[0].from++; - vol->prepath = kmalloc(temp_len+1, GFP_KERNEL); + string = vol->prepath; + vol->prepath = match_strdup(args); if (vol->prepath == NULL) { - printk(KERN_WARNING "CIFS: no memory " - "for path prefix\n"); - goto cifs_parse_mount_err; + kfree(string); + goto out_nomem; } - - if (string[0] != '/') { - vol->prepath[0] = '/'; - strcpy(vol->prepath+1, string); - } else - strcpy(vol->prepath, string); - + /* Compare old prefixpath= option to new one */ + if (!string || strcmp(string, vol->prepath)) + printk(KERN_WARNING "CIFS: the value of the " + "prefixpath= mount option does not " + "match the device string. Using the " + "prefixpath= option for now. In 3.10, " + "that option will be ignored and the " + "contents of the device string will be " + "used instead.(%s != %s)\n", string, + vol->prepath); break; case Opt_iocharset: string = match_strdup(args); @@ -1799,9 +1857,30 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } #endif + if (!vol->UNC) { + cERROR(1, "CIFS mount error: No usable UNC path provided in " + "device string or in unc= option!"); + goto cifs_parse_mount_err; + } - if (vol->UNCip == NULL) - vol->UNCip = &vol->UNC[2]; + /* make sure UNC has a share name */ + if (!strchr(vol->UNC + 3, '\\')) { + cERROR(1, "Malformed UNC. Unable to find share name."); + goto cifs_parse_mount_err; + } + + if (!got_ip) { + /* No ip= option specified? Try to get it from UNC */ + if (!cifs_convert_address(dstaddr, &vol->UNC[2], + strlen(&vol->UNC[2]))) { + printk(KERN_ERR "Unable to determine destination " + "address.\n"); + goto cifs_parse_mount_err; + } + } + + /* set the port that we got earlier */ + cifs_set_port(dstaddr, port); if (uid_specified) vol->override_uid = override_uid; @@ -1972,9 +2051,10 @@ match_security(struct TCP_Server_Info *server, struct smb_vol *vol) return true; } -static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr, - struct smb_vol *vol) +static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) { + struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr; + if ((server->vals != vol->vals) || (server->ops != vol->ops)) return 0; @@ -1995,13 +2075,13 @@ static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr, } static struct TCP_Server_Info * -cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol) +cifs_find_tcp_session(struct smb_vol *vol) { struct TCP_Server_Info *server; spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { - if (!match_server(server, addr, vol)) + if (!match_server(server, vol)) continue; ++server->srv_count; @@ -2051,40 +2131,12 @@ static struct TCP_Server_Info * cifs_get_tcp_session(struct smb_vol *volume_info) { struct TCP_Server_Info *tcp_ses = NULL; - struct sockaddr_storage addr; - struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr; - struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr; int rc; - memset(&addr, 0, sizeof(struct sockaddr_storage)); - - cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip); - - if (volume_info->UNCip && volume_info->UNC) { - rc = cifs_fill_sockaddr((struct sockaddr *)&addr, - volume_info->UNCip, - strlen(volume_info->UNCip), - volume_info->port); - if (!rc) { - /* we failed translating address */ - rc = -EINVAL; - goto out_err; - } - } else if (volume_info->UNCip) { - /* BB using ip addr as tcp_ses name to connect to the - DFS root below */ - cERROR(1, "Connecting to DFS root not implemented yet"); - rc = -EINVAL; - goto out_err; - } else /* which tcp_sess DFS root would we conect to */ { - cERROR(1, "CIFS mount error: No UNC path (e.g. -o " - "unc=//192.168.1.100/public) specified"); - rc = -EINVAL; - goto out_err; - } + cFYI(1, "UNC: %s", volume_info->UNC); /* see if we already have a matching tcp_ses */ - tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info); + tcp_ses = cifs_find_tcp_session(volume_info); if (tcp_ses) return tcp_ses; @@ -2129,27 +2181,18 @@ cifs_get_tcp_session(struct smb_vol *volume_info) INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); INIT_LIST_HEAD(&tcp_ses->smb_ses_list); INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); - + memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr, + sizeof(tcp_ses->srcaddr)); + memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr, + sizeof(tcp_ses->dstaddr)); /* * at this point we are the only ones with the pointer * to the struct since the kernel thread not created yet * no need to spinlock this init of tcpStatus or srv_count */ tcp_ses->tcpStatus = CifsNew; - memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr, - sizeof(tcp_ses->srcaddr)); ++tcp_ses->srv_count; - if (addr.ss_family == AF_INET6) { - cFYI(1, "attempting ipv6 connect"); - /* BB should we allow ipv6 on port 139? */ - /* other OS never observed in Wild doing 139 with v6 */ - memcpy(&tcp_ses->dstaddr, sin_server6, - sizeof(struct sockaddr_in6)); - } else - memcpy(&tcp_ses->dstaddr, sin_server, - sizeof(struct sockaddr_in)); - rc = ip_connect(tcp_ses); if (rc < 0) { cERROR(1, "Error connecting to socket. Aborting operation"); @@ -2397,8 +2440,6 @@ cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)), } #endif /* CONFIG_KEYS */ -static bool warned_on_ntlm; /* globals init to false automatically */ - static struct cifs_ses * cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) { @@ -2475,14 +2516,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) ses->cred_uid = volume_info->cred_uid; ses->linux_uid = volume_info->linux_uid; - /* ntlmv2 is much stronger than ntlm security, and has been broadly - supported for many years, time to update default security mechanism */ - if ((volume_info->secFlg == 0) && warned_on_ntlm == false) { - warned_on_ntlm = true; - cERROR(1, "default security mechanism requested. The default " - "security mechanism will be upgraded from ntlm to " - "ntlmv2 in kernel release 3.3"); - } ses->overrideSecFlg = volume_info->secFlg; mutex_lock(&ses->session_mutex); @@ -2598,13 +2631,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) } } - if (strchr(volume_info->UNC + 3, '\\') == NULL - && strchr(volume_info->UNC + 3, '/') == NULL) { - cERROR(1, "Missing share name"); - rc = -ENODEV; - goto out_fail; - } - /* * BB Do we need to wrap session_mutex around this TCon call and Unix * SetFS as we do on SessSetup and reconnect? @@ -2718,11 +2744,8 @@ cifs_match_super(struct super_block *sb, void *data) struct cifs_ses *ses; struct cifs_tcon *tcon; struct tcon_link *tlink; - struct sockaddr_storage addr; int rc = 0; - memset(&addr, 0, sizeof(struct sockaddr_storage)); - spin_lock(&cifs_tcp_ses_lock); cifs_sb = CIFS_SB(sb); tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); @@ -2736,17 +2759,7 @@ cifs_match_super(struct super_block *sb, void *data) volume_info = mnt_data->vol; - if (!volume_info->UNCip || !volume_info->UNC) - goto out; - - rc = cifs_fill_sockaddr((struct sockaddr *)&addr, - volume_info->UNCip, - strlen(volume_info->UNCip), - volume_info->port); - if (!rc) - goto out; - - if (!match_server(tcp_srv, (struct sockaddr *)&addr, volume_info) || + if (!match_server(tcp_srv, volume_info) || !match_session(ses, volume_info) || !match_tcon(tcon, volume_info->UNC)) { rc = 0; @@ -3261,8 +3274,6 @@ cleanup_volume_info_contents(struct smb_vol *volume_info) { kfree(volume_info->username); kzfree(volume_info->password); - if (volume_info->UNCip != volume_info->UNC + 2) - kfree(volume_info->UNCip); kfree(volume_info->UNC); kfree(volume_info->domainname); kfree(volume_info->iocharset); @@ -3280,14 +3291,16 @@ cifs_cleanup_volume_info(struct smb_vol *volume_info) #ifdef CONFIG_CIFS_DFS_UPCALL -/* build_path_to_root returns full path to root when - * we do not have an exiting connection (tcon) */ +/* + * cifs_build_path_to_root returns full path to root when we do not have an + * exiting connection (tcon) + */ static char * build_unc_path_to_root(const struct smb_vol *vol, const struct cifs_sb_info *cifs_sb) { char *full_path, *pos; - unsigned int pplen = vol->prepath ? strlen(vol->prepath) : 0; + unsigned int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0; unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1); full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL); @@ -3298,6 +3311,7 @@ build_unc_path_to_root(const struct smb_vol *vol, pos = full_path + unc_len; if (pplen) { + *pos++ = CIFS_DIR_SEP(cifs_sb); strncpy(pos, vol->prepath, pplen); pos += pplen; } @@ -3353,7 +3367,6 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses, mdata = NULL; } else { cleanup_volume_info_contents(volume_info); - memset(volume_info, '\0', sizeof(*volume_info)); rc = cifs_setup_volume_info(volume_info, mdata, fake_devname); } @@ -3375,7 +3388,6 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, if (cifs_parse_mount_options(mount_data, devname, volume_info)) return -EINVAL; - if (volume_info->nullauth) { cFYI(1, "Anonymous login"); kfree(volume_info->username); @@ -3412,7 +3424,7 @@ cifs_get_volume_info(char *mount_data, const char *devname) int rc; struct smb_vol *volume_info; - volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL); + volume_info = kmalloc(sizeof(struct smb_vol), GFP_KERNEL); if (!volume_info) return ERR_PTR(-ENOMEM); @@ -3537,8 +3549,10 @@ remote_path_check: rc = -ENOSYS; goto mount_fail_check; } - /* build_path_to_root works only when we have a valid tcon */ - full_path = build_path_to_root(volume_info, cifs_sb, tcon); + /* + * cifs_build_path_to_root works only when we have a valid tcon + */ + full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon); if (full_path == NULL) { rc = -ENOMEM; goto mount_fail_check; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 7c0a81283645..8719bbe0dcc3 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -44,6 +44,38 @@ renew_parental_timestamps(struct dentry *direntry) } while (!IS_ROOT(direntry)); } +char * +cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon) +{ + int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0; + int dfsplen; + char *full_path = NULL; + + /* if no prefix path, simply set path to the root of share to "" */ + if (pplen == 0) { + full_path = kzalloc(1, GFP_KERNEL); + return full_path; + } + + if (tcon->Flags & SMB_SHARE_IS_IN_DFS) + dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); + else + dfsplen = 0; + + full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL); + if (full_path == NULL) + return full_path; + + if (dfsplen) + strncpy(full_path, tcon->treeName, dfsplen); + full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb); + strncpy(full_path + dfsplen + 1, vol->prepath, pplen); + convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); + full_path[dfsplen + pplen] = 0; /* add trailing null */ + return full_path; +} + /* Note: caller must free return buffer */ char * build_path_from_dentry(struct dentry *direntry) @@ -398,7 +430,16 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, * in network traffic in the other paths. */ if (!(oflags & O_CREAT)) { - struct dentry *res = cifs_lookup(inode, direntry, 0); + struct dentry *res; + + /* + * Check for hashed negative dentry. We have already revalidated + * the dentry and it is fine. No need to perform another lookup. + */ + if (!d_unhashed(direntry)) + return -ENOENT; + + res = cifs_lookup(inode, direntry, 0); if (IS_ERR(res)) return PTR_ERR(res); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index edb25b4bbb95..0a6677ba212b 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -505,16 +505,36 @@ out: return rc; } +static int cifs_push_posix_locks(struct cifsFileInfo *cfile); + /* * Try to reacquire byte range locks that were released when session - * to server was lost + * to server was lost. */ -static int cifs_relock_file(struct cifsFileInfo *cifsFile) +static int +cifs_relock_file(struct cifsFileInfo *cfile) { + struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; - /* BB list all locks open on this file and relock */ + /* we are going to update can_cache_brlcks here - need a write access */ + down_write(&cinode->lock_sem); + if (cinode->can_cache_brlcks) { + /* can cache locks - no need to push them */ + up_write(&cinode->lock_sem); + return rc; + } + if (cap_unix(tcon->ses) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + rc = cifs_push_posix_locks(cfile); + else + rc = tcon->ses->server->ops->push_mand_locks(cfile); + + up_write(&cinode->lock_sem); return rc; } @@ -739,10 +759,15 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock) } } +#define CIFS_LOCK_OP 0 +#define CIFS_READ_OP 1 +#define CIFS_WRITE_OP 2 + +/* @rw_check : 0 - no op, 1 - read, 2 - write */ static bool cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, __u64 length, __u8 type, struct cifsFileInfo *cfile, - struct cifsLockInfo **conf_lock, bool rw_check) + struct cifsLockInfo **conf_lock, int rw_check) { struct cifsLockInfo *li; struct cifsFileInfo *cur_cfile = fdlocks->cfile; @@ -752,9 +777,13 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, if (offset + length <= li->offset || offset >= li->offset + li->length) continue; - if (rw_check && server->ops->compare_fids(cfile, cur_cfile) && - current->tgid == li->pid) - continue; + if (rw_check != CIFS_LOCK_OP && current->tgid == li->pid && + server->ops->compare_fids(cfile, cur_cfile)) { + /* shared lock prevents write op through the same fid */ + if (!(li->type & server->vals->shared_lock_type) || + rw_check != CIFS_WRITE_OP) + continue; + } if ((type & server->vals->shared_lock_type) && ((server->ops->compare_fids(cfile, cur_cfile) && current->tgid == li->pid) || type == li->type)) @@ -769,7 +798,7 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, __u8 type, struct cifsLockInfo **conf_lock, - bool rw_check) + int rw_check) { bool rc = false; struct cifs_fid_locks *cur; @@ -805,7 +834,7 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length, down_read(&cinode->lock_sem); exist = cifs_find_lock_conflict(cfile, offset, length, type, - &conf_lock, false); + &conf_lock, CIFS_LOCK_OP); if (exist) { flock->fl_start = conf_lock->offset; flock->fl_end = conf_lock->offset + conf_lock->length - 1; @@ -852,7 +881,7 @@ try_again: down_write(&cinode->lock_sem); exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length, - lock->type, &conf_lock, false); + lock->type, &conf_lock, CIFS_LOCK_OP); if (!exist && cinode->can_cache_brlcks) { list_add_tail(&lock->llist, &cfile->llist->locks); up_write(&cinode->lock_sem); @@ -948,7 +977,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) int rc = 0, stored_rc; struct cifsLockInfo *li, *tmp; struct cifs_tcon *tcon; - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); unsigned int num, max_num, max_buf; LOCKING_ANDX_RANGE *buf, *cur; int types[] = {LOCKING_ANDX_LARGE_FILES, @@ -958,21 +986,12 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) xid = get_xid(); tcon = tlink_tcon(cfile->tlink); - /* we are going to update can_cache_brlcks here - need a write access */ - down_write(&cinode->lock_sem); - if (!cinode->can_cache_brlcks) { - up_write(&cinode->lock_sem); - free_xid(xid); - return rc; - } - /* * Accessing maxBuf is racy with cifs_reconnect - need to store value * and check it for zero before using. */ max_buf = tcon->ses->server->maxBuf; if (!max_buf) { - up_write(&cinode->lock_sem); free_xid(xid); return -EINVAL; } @@ -981,7 +1000,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) sizeof(LOCKING_ANDX_RANGE); buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) { - up_write(&cinode->lock_sem); free_xid(xid); return -ENOMEM; } @@ -1018,9 +1036,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) } } - cinode->can_cache_brlcks = false; - up_write(&cinode->lock_sem); - kfree(buf); free_xid(xid); return rc; @@ -1043,7 +1058,6 @@ struct lock_to_push { static int cifs_push_posix_locks(struct cifsFileInfo *cfile) { - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct file_lock *flock, **before; unsigned int count = 0, i = 0; @@ -1054,14 +1068,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) xid = get_xid(); - /* we are going to update can_cache_brlcks here - need a write access */ - down_write(&cinode->lock_sem); - if (!cinode->can_cache_brlcks) { - up_write(&cinode->lock_sem); - free_xid(xid); - return rc; - } - lock_flocks(); cifs_for_each_lock(cfile->dentry->d_inode, before) { if ((*before)->fl_flags & FL_POSIX) @@ -1127,9 +1133,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) } out: - cinode->can_cache_brlcks = false; - up_write(&cinode->lock_sem); - free_xid(xid); return rc; err_out: @@ -1144,14 +1147,27 @@ static int cifs_push_locks(struct cifsFileInfo *cfile) { struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + int rc = 0; + + /* we are going to update can_cache_brlcks here - need a write access */ + down_write(&cinode->lock_sem); + if (!cinode->can_cache_brlcks) { + up_write(&cinode->lock_sem); + return rc; + } if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - return cifs_push_posix_locks(cfile); + rc = cifs_push_posix_locks(cfile); + else + rc = tcon->ses->server->ops->push_mand_locks(cfile); - return tcon->ses->server->ops->push_mand_locks(cfile); + cinode->can_cache_brlcks = false; + up_write(&cinode->lock_sem); + return rc; } static void @@ -1436,16 +1452,18 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, return -ENOMEM; rc = cifs_lock_add_if(cfile, lock, wait_flag); - if (rc < 0) + if (rc < 0) { kfree(lock); - if (rc <= 0) + return rc; + } + if (!rc) goto out; rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, type, 1, 0, wait_flag); if (rc) { kfree(lock); - goto out; + return rc; } cifs_lock_add(cfile, lock); @@ -1794,7 +1812,6 @@ static int cifs_writepages(struct address_space *mapping, struct TCP_Server_Info *server; struct page *page; int rc = 0; - loff_t isize = i_size_read(mapping->host); /* * If wsize is smaller than the page cache size, default to writing @@ -1899,7 +1916,7 @@ retry: */ set_page_writeback(page); - if (page_offset(page) >= isize) { + if (page_offset(page) >= i_size_read(mapping->host)) { done = true; unlock_page(page); end_page_writeback(page); @@ -1932,7 +1949,8 @@ retry: wdata->offset = page_offset(wdata->pages[0]); wdata->pagesz = PAGE_CACHE_SIZE; wdata->tailsz = - min(isize - page_offset(wdata->pages[nr_pages - 1]), + min(i_size_read(mapping->host) - + page_offset(wdata->pages[nr_pages - 1]), (loff_t)PAGE_CACHE_SIZE); wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz; @@ -2085,7 +2103,15 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, } else { rc = copied; pos += copied; - set_page_dirty(page); + /* + * When we use strict cache mode and cifs_strict_writev was run + * with level II oplock (indicated by leave_pages_clean field of + * CIFS_I(inode)), we can leave pages clean - cifs_strict_writev + * sent the data to the server itself. + */ + if (!CIFS_I(inode)->leave_pages_clean || + !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)) + set_page_dirty(page); } if (rc > 0) { @@ -2436,8 +2462,8 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, } static ssize_t -cifs_writev(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, bool cache_ex) { struct file *file = iocb->ki_filp; struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; @@ -2457,10 +2483,14 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, down_read(&cinode->lock_sem); if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), server->vals->exclusive_lock_type, NULL, - true)) { + CIFS_WRITE_OP)) { mutex_lock(&inode->i_mutex); + if (!cache_ex) + cinode->leave_pages_clean = true; rc = __generic_file_aio_write(iocb, iov, nr_segs, - &iocb->ki_pos); + &iocb->ki_pos); + if (!cache_ex) + cinode->leave_pages_clean = false; mutex_unlock(&inode->i_mutex); } @@ -2487,42 +2517,62 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, struct cifsFileInfo *cfile = (struct cifsFileInfo *) iocb->ki_filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); - -#ifdef CONFIG_CIFS_SMB2 + ssize_t written, written2; /* - * If we have an oplock for read and want to write a data to the file - * we need to store it in the page cache and then push it to the server - * to be sure the next read will get a valid data. + * We need to store clientCanCacheAll here to prevent race + * conditions - this value can be changed during an execution + * of generic_file_aio_write. For CIFS it can be changed from + * true to false only, but for SMB2 it can be changed both from + * true to false and vice versa. So, we can end up with a data + * stored in the cache, not marked dirty and not sent to the + * server if this value changes its state from false to true + * after cifs_write_end. */ - if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead) { - ssize_t written; - int rc; - - written = generic_file_aio_write(iocb, iov, nr_segs, pos); - rc = filemap_fdatawrite(inode->i_mapping); - if (rc) - return (ssize_t)rc; + bool cache_ex = cinode->clientCanCacheAll; + bool cache_read = cinode->clientCanCacheRead; + int rc; + loff_t saved_pos; - return written; + if (cache_ex) { + if (cap_unix(tcon->ses) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu( + tcon->fsUnixInfo.Capability))) + return generic_file_aio_write(iocb, iov, nr_segs, pos); + return cifs_pagecache_writev(iocb, iov, nr_segs, pos, cache_ex); } -#endif /* - * For non-oplocked files in strict cache mode we need to write the data - * to the server exactly from the pos to pos+len-1 rather than flush all - * affected pages because it may cause a error with mandatory locks on - * these pages but not on the region from pos to ppos+len-1. + * For files without exclusive oplock in strict cache mode we need to + * write the data to the server exactly from the pos to pos+len-1 rather + * than flush all affected pages because it may cause a error with + * mandatory locks on these pages but not on the region from pos to + * ppos+len-1. */ + written = cifs_user_writev(iocb, iov, nr_segs, pos); + if (!cache_read || written <= 0) + return written; - if (!cinode->clientCanCacheAll) - return cifs_user_writev(iocb, iov, nr_segs, pos); - + saved_pos = iocb->ki_pos; + iocb->ki_pos = pos; + /* we have a read oplock - need to store a data in the page cache */ if (cap_unix(tcon->ses) && - (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - return generic_file_aio_write(iocb, iov, nr_segs, pos); - - return cifs_writev(iocb, iov, nr_segs, pos); + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu( + tcon->fsUnixInfo.Capability))) + written2 = generic_file_aio_write(iocb, iov, nr_segs, pos); + else + written2 = cifs_pagecache_writev(iocb, iov, nr_segs, pos, + cache_ex); + /* errors occured during writing - invalidate the page cache */ + if (written2 < 0) { + rc = cifs_invalidate_mapping(inode); + if (rc) + written = (ssize_t)rc; + else + iocb->ki_pos = saved_pos; + } + return written; } static struct cifs_readdata * @@ -2892,7 +2942,7 @@ cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, down_read(&cinode->lock_sem); if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), tcon->ses->server->vals->shared_lock_type, - NULL, true)) + NULL, CIFS_READ_OP)) rc = generic_file_aio_read(iocb, iov, nr_segs, pos); up_read(&cinode->lock_sem); return rc; @@ -3536,7 +3586,7 @@ void cifs_oplock_break(struct work_struct *work) if (cinode->clientCanCacheRead == 0) { rc = filemap_fdatawait(inode->i_mapping); mapping_set_error(inode->i_mapping, rc); - invalidate_remote_inode(inode); + cifs_invalidate_mapping(inode); } cFYI(1, "Oplock flush inode %p rc %d", inode, rc); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index afdff79651f1..ed6208ff85a7 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1791,11 +1791,12 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, stat->ino = CIFS_I(inode)->uniqueid; /* - * If on a multiuser mount without unix extensions, and the admin hasn't - * overridden them, set the ownership to the fsuid/fsgid of the current - * process. + * If on a multiuser mount without unix extensions or cifsacl being + * enabled, and the admin hasn't overridden them, set the ownership + * to the fsuid/fsgid of the current process. */ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) && + !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && !tcon->unix_ext) { if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) stat->uid = current_fsuid(); diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index d5ce9e26696c..a82bc51fdc82 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -204,7 +204,7 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len) return rc; } -int +void cifs_set_port(struct sockaddr *addr, const unsigned short int port) { switch (addr->sa_family) { @@ -214,19 +214,7 @@ cifs_set_port(struct sockaddr *addr, const unsigned short int port) case AF_INET6: ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); break; - default: - return 0; } - return 1; -} - -int -cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, - const unsigned short int port) -{ - if (!cifs_convert_address(dst, src, len)) - return 0; - return cifs_set_port(dst, port); } /***************************************************************************** diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index f9b5d3d6cf33..6002fdc920ae 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -66,18 +66,20 @@ static inline void dump_cifs_file_struct(struct file *file, char *label) #endif /* DEBUG2 */ /* + * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT + * * Find the dentry that matches "name". If there isn't one, create one. If it's * a negative dentry or the uniqueid changed, then drop it and recreate it. */ -static struct dentry * -cifs_readdir_lookup(struct dentry *parent, struct qstr *name, +static void +cifs_prime_dcache(struct dentry *parent, struct qstr *name, struct cifs_fattr *fattr) { struct dentry *dentry, *alias; struct inode *inode; struct super_block *sb = parent->d_inode->i_sb; - cFYI(1, "For %s", name->name); + cFYI(1, "%s: for %s", __func__, name->name); if (parent->d_op && parent->d_op->d_hash) parent->d_op->d_hash(parent, parent->d_inode, name); @@ -86,35 +88,33 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name, dentry = d_lookup(parent, name); if (dentry) { + int err; + inode = dentry->d_inode; /* update inode in place if i_ino didn't change */ if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { cifs_fattr_to_inode(inode, fattr); - return dentry; + goto out; } - d_drop(dentry); + err = d_invalidate(dentry); dput(dentry); + if (err) + return; } dentry = d_alloc(parent, name); - if (dentry == NULL) - return NULL; + if (!dentry) + return; inode = cifs_iget(sb, fattr); - if (!inode) { - dput(dentry); - return NULL; - } + if (!inode) + goto out; alias = d_materialise_unique(dentry, inode); - if (alias != NULL) { - dput(dentry); - if (IS_ERR(alias)) - return NULL; - dentry = alias; - } - - return dentry; + if (alias && !IS_ERR(alias)) + dput(alias); +out: + dput(dentry); } static void @@ -134,6 +134,16 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) if (fattr->cf_cifsattrs & ATTR_READONLY) fattr->cf_mode &= ~S_IWUGO; + /* + * We of course don't get ACL info in FIND_FIRST/NEXT results, so + * mark it for revalidation so that "ls -l" will look right. It might + * be super-slow, but if we don't do this then the ownership of files + * may look wrong since the inodes may not have timed out by the time + * "ls" does a stat() call on them. + */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) + fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL && fattr->cf_cifsattrs & ATTR_SYSTEM) { if (fattr->cf_eof == 0) { @@ -649,7 +659,6 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_dirent de = { NULL, }; struct cifs_fattr fattr; - struct dentry *dentry; struct qstr name; int rc = 0; ino_t ino; @@ -720,13 +729,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, */ fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; - ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); - dentry = cifs_readdir_lookup(file->f_dentry, &name, &fattr); + cifs_prime_dcache(file->f_dentry, &name, &fattr); + ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); rc = filldir(dirent, name.name, name.len, file->f_pos, ino, fattr.cf_dtype); - - dput(dentry); return rc; } diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 56cc4be87807..a5d234c8d5d9 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -575,37 +575,6 @@ cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data); } -static char * -cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, - struct cifs_tcon *tcon) -{ - int pplen = vol->prepath ? strlen(vol->prepath) : 0; - int dfsplen; - char *full_path = NULL; - - /* if no prefix path, simply set path to the root of share to "" */ - if (pplen == 0) { - full_path = kzalloc(1, GFP_KERNEL); - return full_path; - } - - if (tcon->Flags & SMB_SHARE_IS_IN_DFS) - dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); - else - dfsplen = 0; - - full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL); - if (full_path == NULL) - return full_path; - - if (dfsplen) - strncpy(full_path, tcon->treeName, dfsplen); - strncpy(full_path + dfsplen, vol->prepath, pplen); - convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); - full_path[dfsplen + pplen] = 0; /* add trailing null */ - return full_path; -} - static void cifs_clear_stats(struct cifs_tcon *tcon) { @@ -766,7 +735,6 @@ smb_set_file_info(struct inode *inode, const char *full_path, struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = NULL; struct cifs_tcon *tcon; - FILE_BASIC_INFO info_buf; /* if the file is already open for write, just use that fileid */ open_file = find_writable_file(cinode, true); @@ -817,7 +785,7 @@ smb_set_file_info(struct inode *inode, const char *full_path, netpid = current->tgid; set_via_filehandle: - rc = CIFSSMBSetFileInfo(xid, tcon, &info_buf, netfid, netpid); + rc = CIFSSMBSetFileInfo(xid, tcon, buf, netfid, netpid); if (!rc) cinode->cifsAttrs = le32_to_cpu(buf->Attributes); @@ -944,7 +912,6 @@ struct smb_version_operations smb1_operations = { .set_path_size = CIFSSMBSetEOF, .set_file_size = CIFSSMBSetFileSize, .set_file_info = smb_set_file_info, - .build_path_to_root = cifs_build_path_to_root, .echo = CIFSSMBEcho, .mkdir = CIFSSMBMkDir, .mkdir_setinfo = cifs_mkdir_setinfo, diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index a93eec30a50d..71e6aed4b382 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -260,13 +260,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) struct cifs_fid_locks *fdlocks; xid = get_xid(); - /* we are going to update can_cache_brlcks here - need a write access */ - down_write(&cinode->lock_sem); - if (!cinode->can_cache_brlcks) { - up_write(&cinode->lock_sem); - free_xid(xid); - return rc; - } /* * Accessing maxBuf is racy with cifs_reconnect - need to store value @@ -274,7 +267,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) */ max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf; if (!max_buf) { - up_write(&cinode->lock_sem); free_xid(xid); return -EINVAL; } @@ -282,7 +274,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) max_num = max_buf / sizeof(struct smb2_lock_element); buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL); if (!buf) { - up_write(&cinode->lock_sem); free_xid(xid); return -ENOMEM; } @@ -293,10 +284,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) rc = stored_rc; } - cinode->can_cache_brlcks = false; kfree(buf); - - up_write(&cinode->lock_sem); free_xid(xid); return rc; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 4d9dbe0b7385..d79de7bc4435 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -262,23 +262,6 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static char * -smb2_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, - struct cifs_tcon *tcon) -{ - int pplen = vol->prepath ? strlen(vol->prepath) : 0; - char *full_path = NULL; - - /* if no prefix path, simply set path to the root of share to "" */ - if (pplen == 0) { - full_path = kzalloc(2, GFP_KERNEL); - return full_path; - } - - cERROR(1, "prefixpath is not supported for SMB2 now"); - return NULL; -} - static bool smb2_can_echo(struct TCP_Server_Info *server) { @@ -613,7 +596,6 @@ struct smb_version_operations smb21_operations = { .set_path_size = smb2_set_path_size, .set_file_size = smb2_set_file_size, .set_file_info = smb2_set_file_info, - .build_path_to_root = smb2_build_path_to_root, .mkdir = smb2_mkdir, .mkdir_setinfo = smb2_mkdir_setinfo, .rmdir = smb2_rmdir, @@ -641,6 +623,91 @@ struct smb_version_operations smb21_operations = { .get_lease_key = smb2_get_lease_key, .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, + .calc_signature = smb2_calc_signature, +}; + + +struct smb_version_operations smb30_operations = { + .compare_fids = smb2_compare_fids, + .setup_request = smb2_setup_request, + .setup_async_request = smb2_setup_async_request, + .check_receive = smb2_check_receive, + .add_credits = smb2_add_credits, + .set_credits = smb2_set_credits, + .get_credits_field = smb2_get_credits_field, + .get_credits = smb2_get_credits, + .get_next_mid = smb2_get_next_mid, + .read_data_offset = smb2_read_data_offset, + .read_data_length = smb2_read_data_length, + .map_error = map_smb2_to_linux_error, + .find_mid = smb2_find_mid, + .check_message = smb2_check_message, + .dump_detail = smb2_dump_detail, + .clear_stats = smb2_clear_stats, + .print_stats = smb2_print_stats, + .is_oplock_break = smb2_is_valid_oplock_break, + .need_neg = smb2_need_neg, + .negotiate = smb2_negotiate, + .negotiate_wsize = smb2_negotiate_wsize, + .negotiate_rsize = smb2_negotiate_rsize, + .sess_setup = SMB2_sess_setup, + .logoff = SMB2_logoff, + .tree_connect = SMB2_tcon, + .tree_disconnect = SMB2_tdis, + .is_path_accessible = smb2_is_path_accessible, + .can_echo = smb2_can_echo, + .echo = SMB2_echo, + .query_path_info = smb2_query_path_info, + .get_srv_inum = smb2_get_srv_inum, + .query_file_info = smb2_query_file_info, + .set_path_size = smb2_set_path_size, + .set_file_size = smb2_set_file_size, + .set_file_info = smb2_set_file_info, + .mkdir = smb2_mkdir, + .mkdir_setinfo = smb2_mkdir_setinfo, + .rmdir = smb2_rmdir, + .unlink = smb2_unlink, + .rename = smb2_rename_path, + .create_hardlink = smb2_create_hardlink, + .open = smb2_open_file, + .set_fid = smb2_set_fid, + .close = smb2_close_file, + .flush = smb2_flush_file, + .async_readv = smb2_async_readv, + .async_writev = smb2_async_writev, + .sync_read = smb2_sync_read, + .sync_write = smb2_sync_write, + .query_dir_first = smb2_query_dir_first, + .query_dir_next = smb2_query_dir_next, + .close_dir = smb2_close_dir, + .calc_smb_size = smb2_calc_size, + .is_status_pending = smb2_is_status_pending, + .oplock_response = smb2_oplock_response, + .queryfs = smb2_queryfs, + .mand_lock = smb2_mand_lock, + .mand_unlock_range = smb2_unlock_range, + .push_mand_locks = smb2_push_mandatory_locks, + .get_lease_key = smb2_get_lease_key, + .set_lease_key = smb2_set_lease_key, + .new_lease_key = smb2_new_lease_key, + .calc_signature = smb3_calc_signature, +}; + +struct smb_version_values smb20_values = { + .version_string = SMB20_VERSION_STRING, + .protocol_id = SMB20_PROT_ID, + .req_capabilities = 0, /* MBZ */ + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, + .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, }; struct smb_version_values smb21_values = { diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index cf33622cdac8..41d9d0725f0f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -425,7 +425,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) } cFYI(1, "sec_flags 0x%x", sec_flags); - if (sec_flags & CIFSSEC_MUST_SIGN) { + if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) { cFYI(1, "Signing required"); if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED | SMB2_NEGOTIATE_SIGNING_ENABLED))) { @@ -612,7 +612,8 @@ ssetup_ntlmssp_authenticate: /* BB add code to build os and lm fields */ - rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, CIFS_LOG_ERROR); + rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, + CIFS_LOG_ERROR | CIFS_NEG_OP); kfree(security_blob); rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 7d25f8b14f93..2aa3535e38ce 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -47,6 +47,10 @@ extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst); extern struct mid_q_entry *smb2_setup_async_request( struct TCP_Server_Info *server, struct smb_rqst *rqst); +extern int smb2_calc_signature(struct smb_rqst *rqst, + struct TCP_Server_Info *server); +extern int smb3_calc_signature(struct smb_rqst *rqst, + struct TCP_Server_Info *server); extern void smb2_echo_request(struct work_struct *work); extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode); extern __u8 smb2_map_lease_to_oplock(__le32 lease_state); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 2a5fdf26f79f..8dd73e61d762 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -39,7 +39,7 @@ #include "smb2status.h" #include "smb2glob.h" -static int +int smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { int i, rc; @@ -116,6 +116,13 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) return rc; } +int +smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) +{ + cFYI(1, "smb3 signatures not supported yet"); + return -EOPNOTSUPP; +} + /* must be called with server->srv_mutex held */ static int smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) @@ -132,7 +139,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) return rc; } - rc = smb2_calc_signature(rqst, server); + rc = server->ops->calc_signature(rqst, server); return rc; } @@ -168,7 +175,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) memset(smb2_pdu->Signature, 0, SMB2_SIGNATURE_SIZE); mutex_lock(&server->srv_mutex); - rc = smb2_calc_signature(rqst, server); + rc = server->ops->calc_signature(rqst, server); mutex_unlock(&server->srv_mutex); if (rc) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 4c6285fff598..e2f57a007029 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -844,6 +844,9 @@ COMPATIBLE_IOCTL(TIOCGDEV) COMPATIBLE_IOCTL(TIOCCBRK) COMPATIBLE_IOCTL(TIOCGSID) COMPATIBLE_IOCTL(TIOCGICOUNT) +COMPATIBLE_IOCTL(TIOCGPKT) +COMPATIBLE_IOCTL(TIOCGPTLCK) +COMPATIBLE_IOCTL(TIOCGEXCL) /* Little t */ COMPATIBLE_IOCTL(TIOCGETD) COMPATIBLE_IOCTL(TIOCSETD) diff --git a/fs/coredump.c b/fs/coredump.c index ce47379bfa61..177493272a61 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -458,7 +458,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) return err; } -void do_coredump(siginfo_t *siginfo, struct pt_regs *regs) +void do_coredump(siginfo_t *siginfo) { struct core_state core_state; struct core_name cn; @@ -474,7 +474,7 @@ void do_coredump(siginfo_t *siginfo, struct pt_regs *regs) static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { .siginfo = siginfo, - .regs = regs, + .regs = signal_pt_regs(), .limit = rlimit(RLIMIT_CORE), /* * We must use the same mm->flags while dumping core to avoid diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index b607d92cdf24..153bb1e42e63 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -59,7 +59,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev case S_IFDIR: inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; - inode->i_private = NULL; /* directory inodes start off with i_nlink == 2 * (for "." entry) */ diff --git a/fs/direct-io.c b/fs/direct-io.c index f86c720dba0e..cf5b44b10c67 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -540,6 +540,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ unsigned long fs_count; /* Number of filesystem-sized blocks */ int create; + unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor; /* * If there was a memory error and we've overwritten all the @@ -554,7 +555,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, fs_count = fs_endblk - fs_startblk + 1; map_bh->b_state = 0; - map_bh->b_size = fs_count << dio->inode->i_blkbits; + map_bh->b_size = fs_count << i_blkbits; /* * For writes inside i_size on a DIO_SKIP_HOLES filesystem we @@ -1053,7 +1054,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, int seg; size_t size; unsigned long addr; - unsigned blkbits = inode->i_blkbits; + unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); + unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; loff_t end = offset; @@ -1149,7 +1151,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, dio->inode = inode; dio->rw = rw; sdio.blkbits = blkbits; - sdio.blkfactor = inode->i_blkbits - blkbits; + sdio.blkfactor = i_blkbits - blkbits; sdio.block_in_file = offset >> blkbits; sdio.get_block = get_block; diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index 1897eb1b4b6a..e4242c3f8486 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -1,6 +1,6 @@ menuconfig DLM tristate "Distributed Lock Manager (DLM)" - depends on EXPERIMENTAL && INET + depends on INET depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) select IP_SCTP help diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 871c1abf6029..77c0f70f8fe8 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -337,6 +337,7 @@ enum rsb_flags { RSB_NEW_MASTER2, RSB_RECOVER_CONVERT, RSB_RECOVER_GRANT, + RSB_RECOVER_LVB_INVAL, }; static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag) diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index b56950758188..a579f30f237d 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -5393,6 +5393,13 @@ static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r, if ((lkb->lkb_nodeid == nodeid_gone) || dlm_is_removed(ls, lkb->lkb_nodeid)) { + /* tell recover_lvb to invalidate the lvb + because a node holding EX/PW failed */ + if ((lkb->lkb_exflags & DLM_LKF_VALBLK) && + (lkb->lkb_grmode >= DLM_LOCK_PW)) { + rsb_set_flag(r, RSB_RECOVER_LVB_INVAL); + } + del_lkb(r, lkb); /* this put should free the lkb */ @@ -6025,15 +6032,18 @@ static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) return error; } -/* The force flag allows the unlock to go ahead even if the lkb isn't granted. - Regardless of what rsb queue the lock is on, it's removed and freed. */ +/* The FORCEUNLOCK flag allows the unlock to go ahead even if the lkb isn't + granted. Regardless of what rsb queue the lock is on, it's removed and + freed. The IVVALBLK flag causes the lvb on the resource to be invalidated + if our lock is PW/EX (it's ignored if our granted mode is smaller.) */ static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) { struct dlm_args args; int error; - set_unlock_args(DLM_LKF_FORCEUNLOCK, lkb->lkb_ua, &args); + set_unlock_args(DLM_LKF_FORCEUNLOCK | DLM_LKF_IVVALBLK, + lkb->lkb_ua, &args); error = unlock_lock(ls, lkb, &args); if (error == -DLM_EUNLOCK) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 331ea4f94efd..dd87a31bcc21 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1385,7 +1385,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) struct connection *con; struct writequeue_entry *e; int offset = 0; - int users = 0; con = nodeid2con(nodeid, allocation); if (!con) @@ -1399,7 +1398,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) } else { offset = e->end; e->end += len; - users = e->users++; + e->users++; } spin_unlock(&con->writequeue_lock); @@ -1414,7 +1413,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) spin_lock(&con->writequeue_lock); offset = e->end; e->end += len; - users = e->users++; + e->users++; list_add_tail(&e->list, &con->writequeue); spin_unlock(&con->writequeue_lock); goto got_one; diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index 4a7a76e42fc3..aedea28a86a1 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -717,8 +717,14 @@ void dlm_recovered_lock(struct dlm_rsb *r) * the VALNOTVALID flag if necessary, and determining the correct lvb contents * based on the lvb's of the locks held on the rsb. * - * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it - * was already set prior to recovery, it's not cleared, regardless of locks. + * RSB_VALNOTVALID is set in two cases: + * + * 1. we are master, but not new, and we purged an EX/PW lock held by a + * failed node (in dlm_recover_purge which set RSB_RECOVER_LVB_INVAL) + * + * 2. we are a new master, and there are only NL/CR locks left. + * (We could probably improve this by only invaliding in this way when + * the previous master left uncleanly. VMS docs mention that.) * * The LVB contents are only considered for changing when this is a new master * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with @@ -734,6 +740,19 @@ static void recover_lvb(struct dlm_rsb *r) int big_lock_exists = 0; int lvblen = r->res_ls->ls_lvblen; + if (!rsb_flag(r, RSB_NEW_MASTER2) && + rsb_flag(r, RSB_RECOVER_LVB_INVAL)) { + /* case 1 above */ + rsb_set_flag(r, RSB_VALNOTVALID); + return; + } + + if (!rsb_flag(r, RSB_NEW_MASTER2)) + return; + + /* we are the new master, so figure out if VALNOTVALID should + be set, and set the rsb lvb from the best lkb available. */ + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) continue; @@ -772,13 +791,10 @@ static void recover_lvb(struct dlm_rsb *r) if (!lock_lvb_exists) goto out; + /* lvb is invalidated if only NL/CR locks remain */ if (!big_lock_exists) rsb_set_flag(r, RSB_VALNOTVALID); - /* don't mess with the lvb unless we're the new master */ - if (!rsb_flag(r, RSB_NEW_MASTER2)) - goto out; - if (!r->res_lvbptr) { r->res_lvbptr = dlm_allocate_lvb(r->res_ls); if (!r->res_lvbptr) @@ -852,12 +868,19 @@ void dlm_recover_rsbs(struct dlm_ls *ls) if (is_master(r)) { if (rsb_flag(r, RSB_RECOVER_CONVERT)) recover_conversion(r); + + /* recover lvb before granting locks so the updated + lvb/VALNOTVALID is presented in the completion */ + recover_lvb(r); + if (rsb_flag(r, RSB_NEW_MASTER2)) recover_grant(r); - recover_lvb(r); count++; + } else { + rsb_clear_flag(r, RSB_VALNOTVALID); } rsb_clear_flag(r, RSB_RECOVER_CONVERT); + rsb_clear_flag(r, RSB_RECOVER_LVB_INVAL); rsb_clear_flag(r, RSB_NEW_MASTER2); unlock_rsb(r); } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index da72250ddc1c..cd96649bfe62 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p) /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) { - return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD; + return op != EPOLL_CTL_DEL; } /* Initialize the poll safe wake up structure */ @@ -676,34 +676,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) return 0; } -/* - * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item - * had no event flags set, indicating that another thread may be currently - * handling that item's events (in the case that EPOLLONESHOT was being - * used). Otherwise a zero result indicates that the item has been disabled - * from receiving events. A disabled item may be re-enabled via - * EPOLL_CTL_MOD. Must be called with "mtx" held. - */ -static int ep_disable(struct eventpoll *ep, struct epitem *epi) -{ - int result = 0; - unsigned long flags; - - spin_lock_irqsave(&ep->lock, flags); - if (epi->event.events & ~EP_PRIVATE_BITS) { - if (ep_is_linked(&epi->rdllink)) - list_del_init(&epi->rdllink); - /* Ensure ep_poll_callback will not add epi back onto ready - list: */ - epi->event.events &= EP_PRIVATE_BITS; - } - else - result = -EBUSY; - spin_unlock_irqrestore(&ep->lock, flags); - - return result; -} - static void ep_free(struct eventpoll *ep) { struct rb_node *rbp; @@ -1048,6 +1020,8 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) rb_insert_color(&epi->rbn, &ep->rbr); } + + #define PATH_ARR_SIZE 5 /* * These are the number paths of length 1 to 5, that we are allowing to emanate @@ -1813,12 +1787,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } else error = -ENOENT; break; - case EPOLL_CTL_DISABLE: - if (epi) - error = ep_disable(ep, epi); - else - error = -ENOENT; - break; } mutex_unlock(&ep->mtx); diff --git a/fs/exec.c b/fs/exec.c index 0039055b1fc6..721a29929511 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1349,7 +1349,7 @@ EXPORT_SYMBOL(remove_arg_zero); /* * cycle the list of binary formats handler, until one recognizes the image */ -int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) +int search_binary_handler(struct linux_binprm *bprm) { unsigned int depth = bprm->recursion_depth; int try,retval; @@ -1374,13 +1374,13 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) for (try=0; try<2; try++) { read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { - int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary; + int (*fn)(struct linux_binprm *) = fmt->load_binary; if (!fn) continue; if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); - retval = fn(bprm, regs); + retval = fn(bprm); /* * Restore the depth counter to its starting value * in this call, so we don't have to rely on every @@ -1439,8 +1439,7 @@ EXPORT_SYMBOL(search_binary_handler); */ static int do_execve_common(const char *filename, struct user_arg_ptr argv, - struct user_arg_ptr envp, - struct pt_regs *regs) + struct user_arg_ptr envp) { struct linux_binprm *bprm; struct file *file; @@ -1524,7 +1523,7 @@ static int do_execve_common(const char *filename, if (retval < 0) goto out; - retval = search_binary_handler(bprm,regs); + retval = search_binary_handler(bprm); if (retval < 0) goto out; @@ -1566,19 +1565,17 @@ out_ret: int do_execve(const char *filename, const char __user *const __user *__argv, - const char __user *const __user *__envp, - struct pt_regs *regs) + const char __user *const __user *__envp) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; - return do_execve_common(filename, argv, envp, regs); + return do_execve_common(filename, argv, envp); } #ifdef CONFIG_COMPAT -int compat_do_execve(const char *filename, +static int compat_do_execve(const char *filename, const compat_uptr_t __user *__argv, - const compat_uptr_t __user *__envp, - struct pt_regs *regs) + const compat_uptr_t __user *__envp) { struct user_arg_ptr argv = { .is_compat = true, @@ -1588,7 +1585,7 @@ int compat_do_execve(const char *filename, .is_compat = true, .ptr.compat = __envp, }; - return do_execve_common(filename, argv, envp, regs); + return do_execve_common(filename, argv, envp); } #endif @@ -1669,7 +1666,7 @@ SYSCALL_DEFINE3(execve, struct filename *path = getname(filename); int error = PTR_ERR(path); if (!IS_ERR(path)) { - error = do_execve(path->name, argv, envp, current_pt_regs()); + error = do_execve(path->name, argv, envp); putname(path); } return error; @@ -1682,8 +1679,7 @@ asmlinkage long compat_sys_execve(const char __user * filename, struct filename *path = getname(filename); int error = PTR_ERR(path); if (!IS_ERR(path)) { - error = compat_do_execve(path->name, argv, envp, - current_pt_regs()); + error = compat_do_execve(path->name, argv, envp); putname(path); } return error; @@ -1696,12 +1692,9 @@ int kernel_execve(const char *filename, const char *const argv[], const char *const envp[]) { - struct pt_regs *p = current_pt_regs(); - int ret; - - ret = do_execve(filename, + int ret = do_execve(filename, (const char __user *const __user *)argv, - (const char __user *const __user *)envp, p); + (const char __user *const __user *)envp); if (ret < 0) return ret; @@ -1709,6 +1702,6 @@ int kernel_execve(const char *filename, * We were successful. We won't be returning to our caller, but * instead to user space by manipulating the kernel stack. */ - ret_from_kernel_execve(p); + ret_from_kernel_execve(current_pt_regs()); } #endif diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 7320a66e958f..22548f56197b 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -2101,8 +2101,9 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) end = start + (range->len >> sb->s_blocksize_bits) - 1; minlen = range->minlen >> sb->s_blocksize_bits; - if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)) || - unlikely(start >= max_blks)) + if (minlen > EXT3_BLOCKS_PER_GROUP(sb) || + start >= max_blks || + range->len < sb->s_blocksize) return -EINVAL; if (end >= max_blks) end = max_blks - 1; diff --git a/fs/file.c b/fs/file.c index 708d997a7748..15cb8618e95d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -519,12 +519,6 @@ struct files_struct init_files = { .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), }; -void daemonize_descriptors(void) -{ - atomic_inc(&init_files.count); - reset_files_struct(&init_files); -} - /* * allocate a file descriptor, mark it busy. */ @@ -685,7 +679,6 @@ void do_close_on_exec(struct files_struct *files) struct fdtable *fdt; /* exec unshares first */ - BUG_ON(atomic_read(&files->count) != 1); spin_lock(&files->file_lock); for (i = 0; ; i++) { unsigned long set; @@ -995,16 +988,18 @@ int iterate_fd(struct files_struct *files, unsigned n, const void *p) { struct fdtable *fdt; - struct file *file; int res = 0; if (!files) return 0; spin_lock(&files->file_lock); - fdt = files_fdtable(files); - while (!res && n < fdt->max_fds) { - file = rcu_dereference_check_fdtable(files, fdt->fd[n++]); - if (file) - res = f(p, file, n); + for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { + struct file *file; + file = rcu_dereference_check_fdtable(files, fdt->fd[n]); + if (!file) + continue; + res = f(p, file, n); + if (res) + break; } spin_unlock(&files->file_lock); return res; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 51ea267d444c..3e3422f7f0a4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -228,6 +228,8 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) static void inode_sync_complete(struct inode *inode) { inode->i_state &= ~I_SYNC; + /* If inode is clean an unused, put it into LRU now... */ + inode_add_lru(inode); /* Waiters must see I_SYNC cleared before being woken up */ smp_mb(); wake_up_bit(&inode->i_state, __I_SYNC); diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 5df4775fea03..fe6ca583bbc0 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -164,27 +164,3 @@ struct fs_struct init_fs = { .seq = SEQCNT_ZERO, .umask = 0022, }; - -void daemonize_fs_struct(void) -{ - struct fs_struct *fs = current->fs; - - if (fs) { - int kill; - - task_lock(current); - - spin_lock(&init_fs.lock); - init_fs.users++; - spin_unlock(&init_fs.lock); - - spin_lock(&fs->lock); - current->fs = &init_fs; - kill = !--fs->users; - spin_unlock(&fs->lock); - - task_unlock(current); - if (kill) - free_fs_struct(fs); - } -} diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 0def0504afc1..e056b4ce4877 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -516,15 +516,13 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) struct gfs2_holder i_gh; int error; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - error = gfs2_glock_nq(&i_gh); - if (error == 0) { - file_accessed(file); - gfs2_glock_dq(&i_gh); - } - gfs2_holder_uninit(&i_gh); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); if (error) return error; + /* grab lock to update inode */ + gfs2_glock_dq_uninit(&i_gh); + file_accessed(file); } vma->vm_ops = &gfs2_vm_ops; @@ -677,10 +675,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, size_t writesize = iov_length(iov, nr_segs); struct dentry *dentry = file->f_dentry; struct gfs2_inode *ip = GFS2_I(dentry->d_inode); - struct gfs2_sbd *sdp; int ret; - sdp = GFS2_SB(file->f_mapping->host); ret = gfs2_rs_alloc(ip); if (ret) return ret; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index e6c2fd53cab2..0f22d09f358d 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -768,7 +768,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->host = s->s_bdev->bd_inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); - mapping->assoc_mapping = NULL; + mapping->private_data = NULL; mapping->backing_dev_info = s->s_bdi; mapping->writeback_index = 0; } diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 8ff95a2d54ee..9ceccb1595a3 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -393,12 +393,10 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) struct gfs2_meta_header *mh; struct gfs2_trans *tr; - lock_buffer(bd->bd_bh); - gfs2_log_lock(sdp); tr = current->journal_info; tr->tr_touched = 1; if (!list_empty(&bd->bd_list)) - goto out; + return; set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; @@ -414,9 +412,6 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) sdp->sd_log_num_buf++; list_add(&bd->bd_list, &sdp->sd_log_le_buf); tr->tr_num_buf_new++; -out: - gfs2_log_unlock(sdp); - unlock_buffer(bd->bd_bh); } static void gfs2_check_magic(struct buffer_head *bh) @@ -621,7 +616,6 @@ static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) static void revoke_lo_before_commit(struct gfs2_sbd *sdp) { - struct gfs2_log_descriptor *ld; struct gfs2_meta_header *mh; unsigned int offset; struct list_head *head = &sdp->sd_log_le_revoke; @@ -634,7 +628,6 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64)); page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke); - ld = page_address(page); offset = sizeof(struct gfs2_log_descriptor); list_for_each_entry(bd, head, bd_list) { @@ -777,12 +770,10 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) struct address_space *mapping = bd->bd_bh->b_page->mapping; struct gfs2_inode *ip = GFS2_I(mapping->host); - lock_buffer(bd->bd_bh); - gfs2_log_lock(sdp); if (tr) tr->tr_touched = 1; if (!list_empty(&bd->bd_list)) - goto out; + return; set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); if (gfs2_is_jdata(ip)) { @@ -793,9 +784,6 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) } else { list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered); } -out: - gfs2_log_unlock(sdp); - unlock_buffer(bd->bd_bh); } /** diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 40c4b0d42fa8..c5af8e18f27a 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -497,8 +497,11 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) struct gfs2_quota_data **qd; int error; - if (ip->i_res == NULL) - gfs2_rs_alloc(ip); + if (ip->i_res == NULL) { + error = gfs2_rs_alloc(ip); + if (error) + return error; + } qd = ip->i_res->rs_qa_qd; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 3cc402ce6fea..38fe18f2f055 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -553,7 +553,6 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd) */ int gfs2_rs_alloc(struct gfs2_inode *ip) { - int error = 0; struct gfs2_blkreserv *res; if (ip->i_res) @@ -561,7 +560,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip) res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); if (!res) - error = -ENOMEM; + return -ENOMEM; RB_CLEAR_NODE(&res->rs_node); @@ -571,7 +570,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip) else ip->i_res = res; up_write(&ip->i_rw_mutex); - return error; + return 0; } static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs) @@ -1263,7 +1262,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp) int ret = 0; u64 amt; u64 trimmed = 0; + u64 start, end, minlen; unsigned int x; + unsigned bs_shift = sdp->sd_sb.sb_bsize_shift; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1271,19 +1272,25 @@ int gfs2_fitrim(struct file *filp, void __user *argp) if (!blk_queue_discard(q)) return -EOPNOTSUPP; - if (argp == NULL) { - r.start = 0; - r.len = ULLONG_MAX; - r.minlen = 0; - } else if (copy_from_user(&r, argp, sizeof(r))) + if (copy_from_user(&r, argp, sizeof(r))) return -EFAULT; ret = gfs2_rindex_update(sdp); if (ret) return ret; - rgd = gfs2_blk2rgrpd(sdp, r.start, 0); - rgd_end = gfs2_blk2rgrpd(sdp, r.start + r.len, 0); + start = r.start >> bs_shift; + end = start + (r.len >> bs_shift); + minlen = max_t(u64, r.minlen, + q->limits.discard_granularity) >> bs_shift; + + rgd = gfs2_blk2rgrpd(sdp, start, 0); + rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0); + + if (end <= start || + minlen > sdp->sd_max_rg_data || + start > rgd_end->rd_data0 + rgd_end->rd_data) + return -EINVAL; while (1) { @@ -1295,7 +1302,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp) /* Trim each bitmap in the rgrp */ for (x = 0; x < rgd->rd_length; x++) { struct gfs2_bitmap *bi = rgd->rd_bits + x; - ret = gfs2_rgrp_send_discards(sdp, rgd->rd_data0, NULL, bi, r.minlen, &amt); + ret = gfs2_rgrp_send_discards(sdp, + rgd->rd_data0, NULL, bi, minlen, + &amt); if (ret) { gfs2_glock_dq_uninit(&gh); goto out; @@ -1324,7 +1333,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) out: r.len = trimmed << 9; - if (argp && copy_to_user(argp, &r, sizeof(r))) + if (copy_to_user(argp, &r, sizeof(r))) return -EFAULT; return ret; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index bc737261f234..d6488674d916 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -810,7 +810,8 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) return; } need_unlock = 1; - } + } else if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE)) + return; if (current->journal_info == NULL) { ret = gfs2_trans_begin(sdp, RES_DINODE, 0); diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index adbd27875ef9..413627072f36 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -155,14 +155,22 @@ void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta) struct gfs2_sbd *sdp = gl->gl_sbd; struct gfs2_bufdata *bd; + lock_buffer(bh); + gfs2_log_lock(sdp); bd = bh->b_private; if (bd) gfs2_assert(sdp, bd->bd_gl == gl); else { + gfs2_log_unlock(sdp); + unlock_buffer(bh); gfs2_attach_bufdata(gl, bh, meta); bd = bh->b_private; + lock_buffer(bh); + gfs2_log_lock(sdp); } lops_add(sdp, bd); + gfs2_log_unlock(sdp); + unlock_buffer(bh); } void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c5bc355d8243..4a55f35a6ced 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -151,8 +151,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long start_addr; struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; if (len & ~huge_page_mask(h)) return -EINVAL; @@ -173,39 +173,13 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return addr; } - if (len > mm->cached_hole_size) - start_addr = mm->free_area_cache; - else { - start_addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - } - -full_search: - addr = ALIGN(start_addr, huge_page_size(h)); - - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (TASK_SIZE - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. - */ - if (start_addr != TASK_UNMAPPED_BASE) { - start_addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - goto full_search; - } - return -ENOMEM; - } - - if (!vma || addr + len <= vma->vm_start) { - mm->free_area_cache = addr + len; - return addr; - } - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - addr = ALIGN(vma->vm_end, huge_page_size(h)); - } + info.flags = 0; + info.length = len; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); } #endif @@ -608,11 +582,11 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, int rc; rc = migrate_huge_page_move_mapping(mapping, newpage, page); - if (rc) + if (rc != MIGRATEPAGE_SUCCESS) return rc; migrate_page_copy(newpage, page); - return 0; + return MIGRATEPAGE_SUCCESS; } static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -923,7 +897,7 @@ static struct file_system_type hugetlbfs_fs_type = { .kill_sb = kill_litter_super, }; -static struct vfsmount *hugetlbfs_vfsmount; +static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; static int can_do_hugetlb_shm(void) { @@ -932,9 +906,22 @@ static int can_do_hugetlb_shm(void) return capable(CAP_IPC_LOCK) || in_group_p(shm_group); } +static int get_hstate_idx(int page_size_log) +{ + struct hstate *h; + + if (!page_size_log) + return default_hstate_idx; + h = size_to_hstate(1 << page_size_log); + if (!h) + return -1; + return h - hstates; +} + struct file *hugetlb_file_setup(const char *name, unsigned long addr, size_t size, vm_flags_t acctflag, - struct user_struct **user, int creat_flags) + struct user_struct **user, + int creat_flags, int page_size_log) { int error = -ENOMEM; struct file *file; @@ -944,9 +931,14 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, struct qstr quick_string; struct hstate *hstate; unsigned long num_pages; + int hstate_idx; + + hstate_idx = get_hstate_idx(page_size_log); + if (hstate_idx < 0) + return ERR_PTR(-ENODEV); *user = NULL; - if (!hugetlbfs_vfsmount) + if (!hugetlbfs_vfsmount[hstate_idx]) return ERR_PTR(-ENOENT); if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { @@ -963,7 +955,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, } } - root = hugetlbfs_vfsmount->mnt_root; + root = hugetlbfs_vfsmount[hstate_idx]->mnt_root; quick_string.name = name; quick_string.len = strlen(quick_string.name); quick_string.hash = 0; @@ -971,7 +963,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, if (!path.dentry) goto out_shm_unlock; - path.mnt = mntget(hugetlbfs_vfsmount); + path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]); error = -ENOSPC; inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0); if (!inode) @@ -1011,8 +1003,9 @@ out_shm_unlock: static int __init init_hugetlbfs_fs(void) { + struct hstate *h; int error; - struct vfsmount *vfsmount; + int i; error = bdi_init(&hugetlbfs_backing_dev_info); if (error) @@ -1029,14 +1022,26 @@ static int __init init_hugetlbfs_fs(void) if (error) goto out; - vfsmount = kern_mount(&hugetlbfs_fs_type); + i = 0; + for_each_hstate(h) { + char buf[50]; + unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10); - if (!IS_ERR(vfsmount)) { - hugetlbfs_vfsmount = vfsmount; - return 0; - } + snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb); + hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type, + buf); - error = PTR_ERR(vfsmount); + if (IS_ERR(hugetlbfs_vfsmount[i])) { + pr_err("hugetlb: Cannot mount internal hugetlbfs for " + "page size %uK", ps_kb); + error = PTR_ERR(hugetlbfs_vfsmount[i]); + hugetlbfs_vfsmount[i] = NULL; + } + i++; + } + /* Non default hstates are optional */ + if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx])) + return 0; out: kmem_cache_destroy(hugetlbfs_inode_cachep); @@ -1047,13 +1052,19 @@ static int __init init_hugetlbfs_fs(void) static void __exit exit_hugetlbfs_fs(void) { + struct hstate *h; + int i; + + /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(hugetlbfs_inode_cachep); - kern_unmount(hugetlbfs_vfsmount); + i = 0; + for_each_hstate(h) + kern_unmount(hugetlbfs_vfsmount[i++]); unregister_filesystem(&hugetlbfs_fs_type); bdi_destroy(&hugetlbfs_backing_dev_info); } diff --git a/fs/inode.c b/fs/inode.c index b03c71957246..14084b72b259 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -165,7 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->host = inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); - mapping->assoc_mapping = NULL; + mapping->private_data = NULL; mapping->backing_dev_info = &default_backing_dev_info; mapping->writeback_index = 0; @@ -408,6 +408,19 @@ static void inode_lru_list_add(struct inode *inode) spin_unlock(&inode->i_sb->s_inode_lru_lock); } +/* + * Add inode to LRU if needed (inode is unused and clean). + * + * Needs inode->i_lock held. + */ +void inode_add_lru(struct inode *inode) +{ + if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) && + !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) + inode_lru_list_add(inode); +} + + static void inode_lru_list_del(struct inode *inode) { spin_lock(&inode->i_sb->s_inode_lru_lock); @@ -1390,8 +1403,7 @@ static void iput_final(struct inode *inode) if (!drop && (sb->s_flags & MS_ACTIVE)) { inode->i_state |= I_REFERENCED; - if (!(inode->i_state & (I_DIRTY|I_SYNC))) - inode_lru_list_add(inode); + inode_add_lru(inode); spin_unlock(&inode->i_lock); return; } diff --git a/fs/internal.h b/fs/internal.h index 916b7cbf3e3e..2f6af7f645eb 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -110,6 +110,7 @@ extern int open_check_o_direct(struct file *f); * inode.c */ extern spinlock_t inode_sb_list_lock; +extern void inode_add_lru(struct inode *inode); /* * fs-writeback.c diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 78b7f84241d4..7f5120bf0ec2 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1961,7 +1961,9 @@ retry: spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); spin_unlock(&journal->j_state_lock); + unlock_buffer(bh); log_wait_commit(journal, tid); + lock_buffer(bh); goto retry; } /* diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 60ef3fb707ff..1506673c087e 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -138,33 +138,39 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, struct page *pg; struct inode *inode = mapping->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_raw_inode ri; + uint32_t alloc_len = 0; pgoff_t index = pos >> PAGE_CACHE_SHIFT; uint32_t pageofs = index << PAGE_CACHE_SHIFT; int ret = 0; + jffs2_dbg(1, "%s()\n", __func__); + + if (pageofs > inode->i_size) { + ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + if (ret) + return ret; + } + + mutex_lock(&f->sem); pg = grab_cache_page_write_begin(mapping, index, flags); - if (!pg) + if (!pg) { + if (alloc_len) + jffs2_complete_reservation(c); + mutex_unlock(&f->sem); return -ENOMEM; + } *pagep = pg; - jffs2_dbg(1, "%s()\n", __func__); - - if (pageofs > inode->i_size) { + if (alloc_len) { /* Make new hole frag from old EOF to new page */ - struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); - struct jffs2_raw_inode ri; struct jffs2_full_dnode *fn; - uint32_t alloc_len; jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n", (unsigned int)inode->i_size, pageofs); - ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, - ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); - if (ret) - goto out_page; - - mutex_lock(&f->sem); memset(&ri, 0, sizeof(ri)); ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); @@ -191,7 +197,6 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, if (IS_ERR(fn)) { ret = PTR_ERR(fn); jffs2_complete_reservation(c); - mutex_unlock(&f->sem); goto out_page; } ret = jffs2_add_full_dnode_to_inode(c, f, fn); @@ -206,12 +211,10 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, jffs2_mark_node_obsolete(c, fn->raw); jffs2_free_full_dnode(fn); jffs2_complete_reservation(c); - mutex_unlock(&f->sem); goto out_page; } jffs2_complete_reservation(c); inode->i_size = pageofs; - mutex_unlock(&f->sem); } /* @@ -220,18 +223,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, * case of a short-copy. */ if (!PageUptodate(pg)) { - mutex_lock(&f->sem); ret = jffs2_do_readpage_nolock(inode, pg); - mutex_unlock(&f->sem); if (ret) goto out_page; } + mutex_unlock(&f->sem); jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags); return ret; out_page: unlock_page(pg); page_cache_release(pg); + mutex_unlock(&f->sem); return ret; } diff --git a/fs/namei.c b/fs/namei.c index 937f9d50c84b..5f4cdf3ad913 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2131,6 +2131,11 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) if (!len) return ERR_PTR(-EACCES); + if (unlikely(name[0] == '.')) { + if (len < 2 || (len == 2 && name[1] == '.')) + return ERR_PTR(-EACCES); + } + while (len--) { c = *(const unsigned char *)name++; if (c == '/' || c == '\0') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ce8cb926526b..b9e66b7e0c14 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -450,7 +450,8 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) nfs_refresh_inode(dentry->d_inode, entry->fattr); goto out; } else { - d_drop(dentry); + if (d_invalidate(dentry) != 0) + goto out; dput(dentry); } } @@ -1100,6 +1101,8 @@ out_set_verifier: out_zap_parent: nfs_zap_caches(dir); out_bad: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); nfs_mark_for_revalidate(dir); if (inode && S_ISDIR(inode->i_mode)) { /* Purge readdir caches. */ @@ -1112,8 +1115,6 @@ out_zap_parent: shrink_dcache_parent(dentry); } d_drop(dentry); - nfs_free_fattr(fattr); - nfs_free_fhandle(fhandle); dput(parent); dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", __func__, dentry->d_parent->d_name.name, diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 3e7b2a0dc0c8..07f76db04ec7 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -431,7 +431,7 @@ void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, mapping->host = inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); - mapping->assoc_mapping = NULL; + mapping->private_data = NULL; mapping->backing_dev_info = bdi; mapping->a_ops = &empty_aops; } diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index f35794b97e8e..a50636025364 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -21,6 +21,7 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) if ((old->path.mnt == new->path.mnt) && (old->path.dentry == new->path.dentry)) return true; + break; case (FSNOTIFY_EVENT_NONE): return true; default: diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 721d692fa8d4..6fcaeb8c902e 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -258,7 +258,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (ret) goto out_close_fd; - fd_install(fd, f); + if (fd != FAN_NOFD) + fd_install(fd, f); return fanotify_event_metadata.event_len; out_close_fd: diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5a4ee77cec51..dda089804942 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2513,18 +2513,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, ret = sd.num_spliced; if (ret > 0) { - unsigned long nr_pages; int err; - nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - err = generic_write_sync(out, *ppos, ret); if (err) ret = err; else *ppos += ret; - balance_dirty_pages_ratelimited_nr(mapping, nr_pages); + balance_dirty_pages_ratelimited(mapping); } return ret; diff --git a/fs/proc/array.c b/fs/proc/array.c index c1c207c36cae..d3696708fc1a 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -438,7 +438,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt += sig->min_flt; maj_flt += sig->maj_flt; - thread_group_times(task, &utime, &stime); + thread_group_cputime_adjusted(task, &utime, &stime); gtime += sig->gtime; } @@ -454,7 +454,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, if (!whole) { min_flt = task->min_flt; maj_flt = task->maj_flt; - task_times(task, &utime, &stime); + task_cputime_adjusted(task, &utime, &stime); gtime = task->gtime; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 144a96732dd7..aa63d25157b8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -873,12 +873,119 @@ static const struct file_operations proc_environ_operations = { .release = mem_release, }; +static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + char buffer[PROC_NUMBUF]; + int oom_adj = OOM_ADJUST_MIN; + size_t len; + unsigned long flags; + + if (!task) + return -ESRCH; + if (lock_task_sighand(task, &flags)) { + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) + oom_adj = OOM_ADJUST_MAX; + else + oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / + OOM_SCORE_ADJ_MAX; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + int oom_adj; + unsigned long flags; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &oom_adj); + if (err) + goto out; + if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) && + oom_adj != OOM_DISABLE) { + err = -EINVAL; + goto out; + } + + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) { + err = -ESRCH; + goto out; + } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + + /* + * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum + * value is always attainable. + */ + if (oom_adj == OOM_ADJUST_MAX) + oom_adj = OOM_SCORE_ADJ_MAX; + else + oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + + if (oom_adj < task->signal->oom_score_adj && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + + /* + * /proc/pid/oom_adj is provided for legacy purposes, ask users to use + * /proc/pid/oom_score_adj instead. + */ + printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), task_pid_nr(task), + task_pid_nr(task)); + + task->signal->oom_score_adj = oom_adj; + trace_oom_score_adj_update(task); +err_sighand: + unlock_task_sighand(task, &flags); +err_task_lock: + task_unlock(task); + put_task_struct(task); +out: + return err < 0 ? err : count; +} + +static const struct file_operations proc_oom_adj_operations = { + .read = oom_adj_read, + .write = oom_adj_write, + .llseek = generic_file_llseek, +}; + static ssize_t oom_score_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); char buffer[PROC_NUMBUF]; - int oom_score_adj = OOM_SCORE_ADJ_MIN; + short oom_score_adj = OOM_SCORE_ADJ_MIN; unsigned long flags; size_t len; @@ -889,7 +996,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, unlock_task_sighand(task, &flags); } put_task_struct(task); - len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj); + len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } @@ -936,15 +1043,15 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto err_task_lock; } - if (oom_score_adj < task->signal->oom_score_adj_min && + if ((short)oom_score_adj < task->signal->oom_score_adj_min && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_sighand; } - task->signal->oom_score_adj = oom_score_adj; + task->signal->oom_score_adj = (short)oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) - task->signal->oom_score_adj_min = oom_score_adj; + task->signal->oom_score_adj_min = (short)oom_score_adj; trace_oom_score_adj_update(task); err_sighand: @@ -1770,8 +1877,9 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, if (!vma) goto out_no_vma; - result = proc_map_files_instantiate(dir, dentry, task, - (void *)(unsigned long)vma->vm_file->f_mode); + if (vma->vm_file) + result = proc_map_files_instantiate(dir, dentry, task, + (void *)(unsigned long)vma->vm_file->f_mode); out_no_vma: up_read(&mm->mmap_sem); @@ -2598,6 +2706,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), @@ -2964,6 +3073,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index a781bdf06694..701580ddfcc3 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -378,12 +378,13 @@ static int test_perm(int mode, int op) return -EACCES; } -static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) +static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op) { + struct ctl_table_root *root = head->root; int mode; if (root->permissions) - mode = root->permissions(root, current->nsproxy, table); + mode = root->permissions(head, table); else mode = table->mode; @@ -491,7 +492,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, * and won't be until we finish. */ error = -EPERM; - if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ)) + if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ)) goto out; /* if that can happen at all, it should be -EINVAL, not -EISDIR */ @@ -717,7 +718,7 @@ static int proc_sys_permission(struct inode *inode, int mask) if (!table) /* global root - r-xr-xr-x */ error = mask & MAY_WRITE ? -EACCES : 0; else /* Use the permissions on the sysctl table entry */ - error = sysctl_perm(head->root, table, mask & ~MAY_NOT_BLOCK); + error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK); sysctl_head_finish(head); return error; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 4ab572e6d277..ed1d8c7212da 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -49,6 +49,7 @@ struct pstore_private { struct pstore_info *psi; enum pstore_type_id type; u64 id; + int count; ssize_t size; char data[]; }; @@ -175,7 +176,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) struct pstore_private *p = dentry->d_inode->i_private; if (p->psi->erase) - p->psi->erase(p->type, p->id, p->psi); + p->psi->erase(p->type, p->id, p->count, + dentry->d_inode->i_ctime, p->psi); return simple_unlink(dir, dentry); } @@ -270,7 +272,7 @@ int pstore_is_mounted(void) * Load it up with "size" bytes of data from "buf". * Set the mtime & ctime to the date that this record was originally stored. */ -int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, +int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, char *data, size_t size, struct timespec time, struct pstore_info *psi) { @@ -306,6 +308,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, goto fail_alloc; private->type = type; private->id = id; + private->count = count; private->psi = psi; switch (type) { diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 4847f588b7d5..937d820f273c 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -50,7 +50,7 @@ extern struct pstore_info *psinfo; extern void pstore_set_kmsg_bytes(int); extern void pstore_get_records(int); extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, - char *data, size_t size, + int count, char *data, size_t size, struct timespec time, struct pstore_info *psi); extern int pstore_is_mounted(void); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index a40da07e93d6..5ea2e77ff023 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -136,7 +136,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, break; ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, - hsize + len, psinfo); + oopscount, hsize + len, psinfo); if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_new_entry = 1; @@ -161,6 +161,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) while (s < e) { unsigned long flags; + u64 id; if (c > psinfo->bufsize) c = psinfo->bufsize; @@ -172,7 +173,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) spin_lock_irqsave(&psinfo->buf_lock, flags); } memcpy(psinfo->buf, s, c); - psinfo->write(PSTORE_TYPE_CONSOLE, 0, NULL, 0, c, psinfo); + psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo); spin_unlock_irqrestore(&psinfo->buf_lock, flags); s += c; c = e - s; @@ -196,7 +197,7 @@ static void pstore_register_console(void) {} static int pstore_write_compat(enum pstore_type_id type, enum kmsg_dump_reason reason, - u64 *id, unsigned int part, + u64 *id, unsigned int part, int count, size_t size, struct pstore_info *psi) { return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi); @@ -266,6 +267,7 @@ void pstore_get_records(int quiet) char *buf = NULL; ssize_t size; u64 id; + int count; enum pstore_type_id type; struct timespec time; int failed = 0, rc; @@ -277,9 +279,9 @@ void pstore_get_records(int quiet) if (psi->open && psi->open(psi)) goto out; - while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) { - rc = pstore_mkfile(type, psi->name, id, buf, (size_t)size, - time, psi); + while ((size = psi->read(&id, &type, &count, &time, &buf, psi)) > 0) { + rc = pstore_mkfile(type, psi->name, id, count, buf, + (size_t)size, time, psi); kfree(buf); buf = NULL; if (rc && (rc != -EEXIST || !quiet)) diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 1a4f6da58eab..2bfa36e0ffe8 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -132,9 +132,8 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max, } static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, - struct timespec *time, - char **buf, - struct pstore_info *psi) + int *count, struct timespec *time, + char **buf, struct pstore_info *psi) { ssize_t size; struct ramoops_context *cxt = psi->data; @@ -236,8 +235,8 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, return 0; } -static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, - struct pstore_info *psi) +static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, + struct timespec time, struct pstore_info *psi) { struct ramoops_context *cxt = psi->data; struct persistent_ram_zone *prz; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index f27f01a98aa2..d83736fbc26c 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1782,8 +1782,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); - dquot_initialize(inode); + reiserfs_write_unlock(inode->i_sb); err = dquot_alloc_inode(inode); + reiserfs_write_lock(inode->i_sb); if (err) goto out_end_trans; if (!dir->i_nlink) { @@ -1979,8 +1980,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, out_end_trans: journal_end(th, th->t_super, th->t_blocks_allocated); + reiserfs_write_unlock(inode->i_sb); /* Drop can be outside and it needs more credits so it's better to have it outside */ dquot_drop(inode); + reiserfs_write_lock(inode->i_sb); inode->i_flags |= S_NOQUOTA; make_bad_inode(inode); @@ -3103,10 +3106,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) /* must be turned off for recursive notify_change calls */ ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); - depth = reiserfs_write_lock_once(inode->i_sb); if (is_quota_modification(inode, attr)) dquot_initialize(inode); - + depth = reiserfs_write_lock_once(inode->i_sb); if (attr->ia_valid & ATTR_SIZE) { /* version 2 items will be caught by the s_maxbytes check ** done for us in vmtruncate @@ -3170,7 +3172,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) error = journal_begin(&th, inode->i_sb, jbegin_count); if (error) goto out; + reiserfs_write_unlock_once(inode->i_sb, depth); error = dquot_transfer(inode, attr); + depth = reiserfs_write_lock_once(inode->i_sb); if (error) { journal_end(&th, inode->i_sb, jbegin_count); goto out; diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index f8afa4b162b8..2f40a4c70a4d 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1968,7 +1968,9 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree key2type(&(key->on_disk_key))); #endif + reiserfs_write_unlock(inode->i_sb); retval = dquot_alloc_space_nodirty(inode, pasted_size); + reiserfs_write_lock(inode->i_sb); if (retval) { pathrelse(search_path); return retval; @@ -2061,9 +2063,11 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, "reiserquota insert_item(): allocating %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(ih)); #endif + reiserfs_write_unlock(inode->i_sb); /* We can't dirty inode here. It would be immediately written but * appropriate stat item isn't inserted yet... */ retval = dquot_alloc_space_nodirty(inode, quota_bytes); + reiserfs_write_lock(inode->i_sb); if (retval) { pathrelse(path); return retval; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 1078ae179993..418bdc3a57da 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -298,7 +298,9 @@ static int finish_unfinished(struct super_block *s) retval = remove_save_link_only(s, &save_link_key, 0); continue; } + reiserfs_write_unlock(s); dquot_initialize(inode); + reiserfs_write_lock(s); if (truncate && S_ISDIR(inode->i_mode)) { /* We got a truncate request for a dir which is impossible. @@ -1335,7 +1337,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) kfree(qf_names[i]); #endif err = -EINVAL; - goto out_err; + goto out_unlock; } #ifdef CONFIG_QUOTA handle_quota_files(s, qf_names, &qfmt); @@ -1379,7 +1381,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) if (blocks) { err = reiserfs_resize(s, blocks); if (err != 0) - goto out_err; + goto out_unlock; } if (*mount_flags & MS_RDONLY) { @@ -1389,9 +1391,15 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) /* it is read-only already */ goto out_ok; + /* + * Drop write lock. Quota will retake it when needed and lock + * ordering requires calling dquot_suspend() without it. + */ + reiserfs_write_unlock(s); err = dquot_suspend(s, -1); if (err < 0) goto out_err; + reiserfs_write_lock(s); /* try to remount file system with read-only permissions */ if (sb_umount_state(rs) == REISERFS_VALID_FS @@ -1401,7 +1409,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) err = journal_begin(&th, s, 10); if (err) - goto out_err; + goto out_unlock; /* Mounting a rw partition read-only. */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); @@ -1416,7 +1424,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) if (reiserfs_is_journal_aborted(journal)) { err = journal->j_errno; - goto out_err; + goto out_unlock; } handle_data_mode(s, mount_options); @@ -1425,7 +1433,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) s->s_flags &= ~MS_RDONLY; /* now it is safe to call journal_begin */ err = journal_begin(&th, s, 10); if (err) - goto out_err; + goto out_unlock; /* Mount a partition which is read-only, read-write */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); @@ -1442,10 +1450,16 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) SB_JOURNAL(s)->j_must_wait = 1; err = journal_end(&th, s, 10); if (err) - goto out_err; + goto out_unlock; if (!(*mount_flags & MS_RDONLY)) { + /* + * Drop write lock. Quota will retake it when needed and lock + * ordering requires calling dquot_resume() without it. + */ + reiserfs_write_unlock(s); dquot_resume(s, -1); + reiserfs_write_lock(s); finish_unfinished(s); reiserfs_xattr_init(s, *mount_flags); } @@ -1455,9 +1469,10 @@ out_ok: reiserfs_write_unlock(s); return 0; +out_unlock: + reiserfs_write_unlock(s); out_err: kfree(new_opts); - reiserfs_write_unlock(s); return err; } @@ -2095,13 +2110,15 @@ static int reiserfs_write_dquot(struct dquot *dquot) REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (ret) goto out; + reiserfs_write_unlock(dquot->dq_sb); ret = dquot_commit(dquot); + reiserfs_write_lock(dquot->dq_sb); err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (!ret && err) ret = err; - out: +out: reiserfs_write_unlock(dquot->dq_sb); return ret; } @@ -2117,13 +2134,15 @@ static int reiserfs_acquire_dquot(struct dquot *dquot) REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (ret) goto out; + reiserfs_write_unlock(dquot->dq_sb); ret = dquot_acquire(dquot); + reiserfs_write_lock(dquot->dq_sb); err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (!ret && err) ret = err; - out: +out: reiserfs_write_unlock(dquot->dq_sb); return ret; } @@ -2137,19 +2156,21 @@ static int reiserfs_release_dquot(struct dquot *dquot) ret = journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + reiserfs_write_unlock(dquot->dq_sb); if (ret) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); goto out; } ret = dquot_release(dquot); + reiserfs_write_lock(dquot->dq_sb); err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (!ret && err) ret = err; - out: reiserfs_write_unlock(dquot->dq_sb); +out: return ret; } @@ -2174,11 +2195,13 @@ static int reiserfs_write_info(struct super_block *sb, int type) ret = journal_begin(&th, sb, 2); if (ret) goto out; + reiserfs_write_unlock(sb); ret = dquot_commit_info(sb, type); + reiserfs_write_lock(sb); err = journal_end(&th, sb, 2); if (!ret && err) ret = err; - out: +out: reiserfs_write_unlock(sb); return ret; } @@ -2203,8 +2226,11 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, struct reiserfs_transaction_handle th; int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA; - if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) - return -EINVAL; + reiserfs_write_lock(sb); + if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) { + err = -EINVAL; + goto out; + } /* Quotafile not on the same filesystem? */ if (path->dentry->d_sb != sb) { @@ -2246,8 +2272,10 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, if (err) goto out; } - err = dquot_quota_on(sb, type, format_id, path); + reiserfs_write_unlock(sb); + return dquot_quota_on(sb, type, format_id, path); out: + reiserfs_write_unlock(sb); return err; } @@ -2320,7 +2348,9 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; tmp_bh.b_state = 0; + reiserfs_write_lock(sb); err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE); + reiserfs_write_unlock(sb); if (err) goto out; if (offset || tocopy != sb->s_blocksize) @@ -2336,10 +2366,12 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, flush_dcache_page(bh->b_page); set_buffer_uptodate(bh); unlock_buffer(bh); + reiserfs_write_lock(sb); reiserfs_prepare_for_journal(sb, bh, 1); journal_mark_dirty(current->journal_info, sb, bh); if (!journal_quota) reiserfs_add_ordered_list(inode, bh); + reiserfs_write_unlock(sb); brelse(bh); offset = 0; towrite -= tocopy; diff --git a/fs/splice.c b/fs/splice.c index 13e5b4776e7a..8890604e3fcd 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1024,17 +1024,14 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, ret = sd.num_spliced; if (ret > 0) { - unsigned long nr_pages; int err; - nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - err = generic_write_sync(out, *ppos, ret); if (err) ret = err; else *ppos += ret; - balance_dirty_pages_ratelimited_nr(mapping, nr_pages); + balance_dirty_pages_ratelimited(mapping); } sb_end_write(inode->i_sb); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 00012e31829d..602f56db0442 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -485,8 +485,8 @@ const struct file_operations sysfs_file_operations = { .poll = sysfs_poll, }; -int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, - const void **pns) +static int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, + const void **pns) { struct sysfs_dirent *dir_sd = kobj->sd; const struct sysfs_ops *ops; diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 28ec13af28d9..2dcf3d473fec 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -681,8 +681,16 @@ int ubifs_find_free_leb_for_idx(struct ubifs_info *c) if (!lprops) { lprops = ubifs_fast_find_freeable(c); if (!lprops) { - ubifs_assert(c->freeable_cnt == 0); - if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { + /* + * The first condition means the following: go scan the + * LPT if there are uncategorized lprops, which means + * there may be freeable LEBs there (UBIFS does not + * store the information about freeable LEBs in the + * master node). + */ + if (c->in_a_category_cnt != c->main_lebs || + c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { + ubifs_assert(c->freeable_cnt == 0); lprops = scan_for_leb_for_idx(c); if (IS_ERR(lprops)) { err = PTR_ERR(lprops); diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index e5a2a35a46dc..46190a7c42a6 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -300,8 +300,11 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, default: ubifs_assert(0); } + lprops->flags &= ~LPROPS_CAT_MASK; lprops->flags |= cat; + c->in_a_category_cnt += 1; + ubifs_assert(c->in_a_category_cnt <= c->main_lebs); } /** @@ -334,6 +337,9 @@ static void ubifs_remove_from_cat(struct ubifs_info *c, default: ubifs_assert(0); } + + c->in_a_category_cnt -= 1; + ubifs_assert(c->in_a_category_cnt >= 0); } /** diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 5486346d0a3f..d133c276fe05 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1183,6 +1183,8 @@ struct ubifs_debug_info; * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size) * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size) * @freeable_cnt: number of freeable LEBs in @freeable_list + * @in_a_category_cnt: count of lprops which are in a certain category, which + * basically meants that they were loaded from the flash * * @ltab_lnum: LEB number of LPT's own lprops table * @ltab_offs: offset of LPT's own lprops table @@ -1412,6 +1414,7 @@ struct ubifs_info { struct list_head freeable_list; struct list_head frdi_idx_list; int freeable_cnt; + int in_a_category_cnt; int ltab_lnum; int ltab_offs; diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 6100ec0fa1d4..5a7ffe54f5d5 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -2,6 +2,7 @@ config XFS_FS tristate "XFS filesystem support" depends on BLOCK select EXPORTFS + select LIBCRC32C help XFS is a high performance journaling filesystem which originated on the SGI IRIX platform. It is completely multi-threaded, can diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index d2bf974b1a2f..d02201df855b 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -37,9 +37,8 @@ xfs-y += xfs_aops.o \ xfs_file.o \ xfs_filestream.o \ xfs_fsops.o \ - xfs_fs_subr.o \ xfs_globals.o \ - xfs_iget.o \ + xfs_icache.o \ xfs_ioctl.o \ xfs_iomap.o \ xfs_iops.o \ @@ -47,7 +46,6 @@ xfs-y += xfs_aops.o \ xfs_message.o \ xfs_mru_cache.o \ xfs_super.o \ - xfs_sync.o \ xfs_xattr.o \ xfs_rename.o \ xfs_utils.o \ diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h index 4732d71262cc..104db0f3bed6 100644 --- a/fs/xfs/uuid.h +++ b/fs/xfs/uuid.h @@ -26,4 +26,10 @@ extern int uuid_is_nil(uuid_t *uuid); extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); +static inline void +uuid_copy(uuid_t *dst, uuid_t *src) +{ + memcpy(dst, src, sizeof(uuid_t)); +} + #endif /* __XFS_SUPPORT_UUID_H__ */ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 44d65c1533c0..f2aeedb6a579 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -108,6 +108,8 @@ typedef struct xfs_agf { extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); +extern const struct xfs_buf_ops xfs_agf_buf_ops; + /* * Size of the unlinked inode hash table in the agi. */ @@ -161,6 +163,8 @@ typedef struct xfs_agi { extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); +extern const struct xfs_buf_ops xfs_agi_buf_ops; + /* * The third a.g. block contains the a.g. freelist, an array * of block pointers to blocks owned by the allocation btree code. @@ -233,6 +237,7 @@ typedef struct xfs_perag { #define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup in xfs_inode_ag_iterator */ #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ +#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ #define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) #define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 4f33c32affe3..393055fe3aef 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -430,6 +430,60 @@ xfs_alloc_fixup_trees( return 0; } +static void +xfs_agfl_verify( + struct xfs_buf *bp) +{ +#ifdef WHEN_CRCS_COME_ALONG + /* + * we cannot actually do any verification of the AGFL because mkfs does + * not initialise the AGFL to zero or NULL. Hence the only valid part of + * the AGFL is what the AGF says is active. We can't get to the AGF, so + * we can't verify just those entries are valid. + * + * This problem goes away when the CRC format change comes along as that + * requires the AGFL to be initialised by mkfs. At that point, we can + * verify the blocks in the agfl -active or not- lie within the bounds + * of the AG. Until then, just leave this check ifdef'd out. + */ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); + int agfl_ok = 1; + + int i; + + for (i = 0; i < XFS_AGFL_SIZE(mp); i++) { + if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK || + be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) + agfl_ok = 0; + } + + if (!agfl_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +#endif +} + +static void +xfs_agfl_write_verify( + struct xfs_buf *bp) +{ + xfs_agfl_verify(bp); +} + +static void +xfs_agfl_read_verify( + struct xfs_buf *bp) +{ + xfs_agfl_verify(bp); +} + +const struct xfs_buf_ops xfs_agfl_buf_ops = { + .verify_read = xfs_agfl_read_verify, + .verify_write = xfs_agfl_write_verify, +}; + /* * Read in the allocation group free block array. */ @@ -447,7 +501,7 @@ xfs_alloc_read_agfl( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp); + XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -1866,6 +1920,7 @@ xfs_alloc_fix_freelist( /* * Initialize the args structure. */ + memset(&targs, 0, sizeof(targs)); targs.tp = tp; targs.mp = mp; targs.agbp = agbp; @@ -2090,6 +2145,63 @@ xfs_alloc_put_freelist( return 0; } +static void +xfs_agf_verify( + struct xfs_buf *bp) + { + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agf *agf; + int agf_ok; + + agf = XFS_BUF_TO_AGF(bp); + + agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && + XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && + be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp); + + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag) + agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) == + bp->b_pag->pag_agno; + + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= + be32_to_cpu(agf->agf_length); + + if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF))) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_agf_read_verify( + struct xfs_buf *bp) +{ + xfs_agf_verify(bp); +} + +static void +xfs_agf_write_verify( + struct xfs_buf *bp) +{ + xfs_agf_verify(bp); +} + +const struct xfs_buf_ops xfs_agf_buf_ops = { + .verify_read = xfs_agf_read_verify, + .verify_write = xfs_agf_write_verify, +}; + /* * Read in the allocation group header (free/alloc section). */ @@ -2101,44 +2213,19 @@ xfs_read_agf( int flags, /* XFS_BUF_ */ struct xfs_buf **bpp) /* buffer for the ag freelist header */ { - struct xfs_agf *agf; /* ag freelist header */ - int agf_ok; /* set if agf is consistent */ int error; ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), flags, bpp); + XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); if (error) return error; if (!*bpp) return 0; ASSERT(!(*bpp)->b_error); - agf = XFS_BUF_TO_AGF(*bpp); - - /* - * Validate the magic number of the agf block. - */ - agf_ok = - agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && - XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && - be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && - be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_seqno) == agno; - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) - agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= - be32_to_cpu(agf->agf_length); - if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, - XFS_RANDOM_ALLOC_READ_AGF))) { - XFS_CORRUPTION_ERROR("xfs_alloc_read_agf", - XFS_ERRLEVEL_LOW, mp, agf); - xfs_trans_brelse(tp, *bpp); - return XFS_ERROR(EFSCORRUPTED); - } xfs_buf_set_ref(*bpp, XFS_AGF_REF); return 0; } @@ -2207,7 +2294,7 @@ xfs_alloc_read_agf( * group or loop over the allocation groups to find the result. */ int /* error */ -__xfs_alloc_vextent( +xfs_alloc_vextent( xfs_alloc_arg_t *args) /* allocation argument structure */ { xfs_agblock_t agsize; /* allocation group size */ @@ -2417,46 +2504,6 @@ error0: return error; } -static void -xfs_alloc_vextent_worker( - struct work_struct *work) -{ - struct xfs_alloc_arg *args = container_of(work, - struct xfs_alloc_arg, work); - unsigned long pflags; - - /* we are in a transaction context here */ - current_set_flags_nested(&pflags, PF_FSTRANS); - - args->result = __xfs_alloc_vextent(args); - complete(args->done); - - current_restore_flags_nested(&pflags, PF_FSTRANS); -} - -/* - * Data allocation requests often come in with little stack to work on. Push - * them off to a worker thread so there is lots of stack to use. Metadata - * requests, OTOH, are generally from low stack usage paths, so avoid the - * context switch overhead here. - */ -int -xfs_alloc_vextent( - struct xfs_alloc_arg *args) -{ - DECLARE_COMPLETION_ONSTACK(done); - - if (!args->userdata) - return __xfs_alloc_vextent(args); - - - args->done = &done; - INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker); - queue_work(xfs_alloc_wq, &args->work); - wait_for_completion(&done); - return args->result; -} - /* * Free an extent. * Just break up the extent address and hand off to xfs_free_ag_extent diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 93be4a667ca1..99d0a6101558 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -120,9 +120,6 @@ typedef struct xfs_alloc_arg { char isfl; /* set if is freelist blocks - !acctg */ char userdata; /* set if this is user data */ xfs_fsblock_t firstblock; /* io first block allocated */ - struct completion *done; - struct work_struct work; - int result; } xfs_alloc_arg_t; /* @@ -234,4 +231,7 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ +extern const struct xfs_buf_ops xfs_agf_buf_ops; +extern const struct xfs_buf_ops xfs_agfl_buf_ops; + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index f1647caace8f..b1ddef6b2689 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -121,6 +121,8 @@ xfs_allocbt_free_block( xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); + + xfs_trans_binval(cur->bc_tp, bp); return 0; } @@ -270,6 +272,82 @@ xfs_allocbt_key_diff( return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } +static void +xfs_allocbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + int sblock_ok; /* block passes checks */ + + /* + * magic number and level verification + * + * During growfs operations, we can't verify the exact level as the + * perag is not fully initialised and hence not attached to the buffer. + * In this case, check against the maximum tree depth. + */ + level = be16_to_cpu(block->bb_level); + switch (block->bb_magic) { + case cpu_to_be32(XFS_ABTB_MAGIC): + if (pag) + sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi]; + else + sblock_ok = level < mp->m_ag_maxlevels; + break; + case cpu_to_be32(XFS_ABTC_MAGIC): + if (pag) + sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi]; + else + sblock_ok = level < mp->m_ag_maxlevels; + break; + default: + sblock_ok = 0; + break; + } + + /* numrecs verification */ + sblock_ok = sblock_ok && + be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0]; + + /* sibling pointer verification */ + sblock_ok = sblock_ok && + (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_leftsib && + (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_rightsib; + + if (!sblock_ok) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_allocbt_read_verify( + struct xfs_buf *bp) +{ + xfs_allocbt_verify(bp); +} + +static void +xfs_allocbt_write_verify( + struct xfs_buf *bp) +{ + xfs_allocbt_verify(bp); +} + +const struct xfs_buf_ops xfs_allocbt_buf_ops = { + .verify_read = xfs_allocbt_read_verify, + .verify_write = xfs_allocbt_write_verify, +}; + + #ifdef DEBUG STATIC int xfs_allocbt_keys_inorder( @@ -325,6 +403,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, + .buf_ops = &xfs_allocbt_buf_ops, #ifdef DEBUG .keys_inorder = xfs_allocbt_keys_inorder, .recs_inorder = xfs_allocbt_recs_inorder, diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index 359fb86ed876..7e89a2b429dd 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h @@ -93,4 +93,6 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, xfs_agnumber_t, xfs_btnum_t); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); +extern const struct xfs_buf_ops xfs_allocbt_buf_ops; + #endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e562dd43f41f..4111a40ebe1a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc( ioend->io_append_trans = tp; /* - * We will pass freeze protection with a transaction. So tell lockdep + * We may pass freeze protection with a transaction. So tell lockdep * we released it. */ rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], @@ -149,11 +149,13 @@ xfs_setfilesize( xfs_fsize_t isize; /* - * The transaction was allocated in the I/O submission thread, - * thus we need to mark ourselves as beeing in a transaction - * manually. + * The transaction may have been allocated in the I/O submission thread, + * thus we need to mark ourselves as beeing in a transaction manually. + * Similarly for freeze protection. */ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); xfs_ilock(ip, XFS_ILOCK_EXCL); isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); @@ -187,7 +189,8 @@ xfs_finish_ioend( if (ioend->io_type == XFS_IO_UNWRITTEN) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); - else if (ioend->io_append_trans) + else if (ioend->io_append_trans || + (ioend->io_isdirect && xfs_ioend_is_append(ioend))) queue_work(mp->m_data_workqueue, &ioend->io_work); else xfs_destroy_ioend(ioend); @@ -205,15 +208,6 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); int error = 0; - if (ioend->io_append_trans) { - /* - * We've got freeze protection passed with the transaction. - * Tell lockdep about it. - */ - rwsem_acquire_read( - &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); - } if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { ioend->io_error = -EIO; goto done; @@ -226,35 +220,31 @@ xfs_end_io( * range to normal written extens after the data I/O has finished. */ if (ioend->io_type == XFS_IO_UNWRITTEN) { + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, + ioend->io_size); + } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) { /* - * For buffered I/O we never preallocate a transaction when - * doing the unwritten extent conversion, but for direct I/O - * we do not know if we are converting an unwritten extent - * or not at the point where we preallocate the transaction. + * For direct I/O we do not know if we need to allocate blocks + * or not so we can't preallocate an append transaction as that + * results in nested reservations and log space deadlocks. Hence + * allocate the transaction here. While this is sub-optimal and + * can block IO completion for some time, we're stuck with doing + * it this way until we can pass the ioend to the direct IO + * allocation callbacks and avoid nesting that way. */ - if (ioend->io_append_trans) { - ASSERT(ioend->io_isdirect); - - current_set_flags_nested( - &ioend->io_append_trans->t_pflags, PF_FSTRANS); - xfs_trans_cancel(ioend->io_append_trans, 0); - } - - error = xfs_iomap_write_unwritten(ip, ioend->io_offset, - ioend->io_size); - if (error) { - ioend->io_error = -error; + error = xfs_setfilesize_trans_alloc(ioend); + if (error) goto done; - } + error = xfs_setfilesize(ioend); } else if (ioend->io_append_trans) { error = xfs_setfilesize(ioend); - if (error) - ioend->io_error = -error; } else { ASSERT(!xfs_ioend_is_append(ioend)); } done: + if (error) + ioend->io_error = -error; xfs_destroy_ioend(ioend); } @@ -481,11 +471,17 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) * * The fix is two passes across the ioend list - one to start writeback on the * buffer_heads, and then submit them for I/O on the second pass. + * + * If @fail is non-zero, it means that we have a situation where some part of + * the submission process has failed after we have marked paged for writeback + * and unlocked them. In this situation, we need to fail the ioend chain rather + * than submit it to IO. This typically only happens on a filesystem shutdown. */ STATIC void xfs_submit_ioend( struct writeback_control *wbc, - xfs_ioend_t *ioend) + xfs_ioend_t *ioend, + int fail) { xfs_ioend_t *head = ioend; xfs_ioend_t *next; @@ -506,6 +502,18 @@ xfs_submit_ioend( next = ioend->io_list; bio = NULL; + /* + * If we are failing the IO now, just mark the ioend with an + * error and finish it. This will run IO completion immediately + * as there is only one reference to the ioend at this point in + * time. + */ + if (fail) { + ioend->io_error = -fail; + xfs_finish_ioend(ioend); + continue; + } + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { if (!bio) { @@ -1060,7 +1068,18 @@ xfs_vm_writepage( xfs_start_page_writeback(page, 1, count); - if (ioend && imap_valid) { + /* if there is no IO to be submitted for this page, we are done */ + if (!ioend) + return 0; + + ASSERT(iohead); + + /* + * Any errors from this point onwards need tobe reported through the IO + * completion path as we have marked the initial page as under writeback + * and unlocked it. + */ + if (imap_valid) { xfs_off_t end_index; end_index = imap.br_startoff + imap.br_blockcount; @@ -1079,20 +1098,15 @@ xfs_vm_writepage( wbc, end_index); } - if (iohead) { - /* - * Reserve log space if we might write beyond the on-disk - * inode size. - */ - if (ioend->io_type != XFS_IO_UNWRITTEN && - xfs_ioend_is_append(ioend)) { - err = xfs_setfilesize_trans_alloc(ioend); - if (err) - goto error; - } - xfs_submit_ioend(wbc, iohead); - } + /* + * Reserve log space if we might write beyond the on-disk inode size. + */ + err = 0; + if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) + err = xfs_setfilesize_trans_alloc(ioend); + + xfs_submit_ioend(wbc, iohead, err); return 0; @@ -1408,25 +1422,21 @@ xfs_vm_direct_IO( size_t size = iov_length(iov, nr_segs); /* - * We need to preallocate a transaction for a size update - * here. In the case that this write both updates the size - * and converts at least on unwritten extent we will cancel - * the still clean transaction after the I/O has finished. + * We cannot preallocate a size update transaction here as we + * don't know whether allocation is necessary or not. Hence we + * can only tell IO completion that one is necessary if we are + * not doing unwritten extent conversion. */ iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); - if (offset + size > XFS_I(inode)->i_d.di_size) { - ret = xfs_setfilesize_trans_alloc(ioend); - if (ret) - goto out_destroy_ioend; + if (offset + size > XFS_I(inode)->i_d.di_size) ioend->io_isdirect = 1; - } ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, xfs_get_blocks_direct, xfs_end_io_direct_write, NULL, 0); if (ret != -EIOCBQUEUED && iocb->private) - goto out_trans_cancel; + goto out_destroy_ioend; } else { ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, @@ -1436,15 +1446,6 @@ xfs_vm_direct_IO( return ret; -out_trans_cancel: - if (ioend->io_append_trans) { - current_set_flags_nested(&ioend->io_append_trans->t_pflags, - PF_FSTRANS); - rwsem_acquire_read( - &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); - xfs_trans_cancel(ioend->io_append_trans, 0); - } out_destroy_ioend: xfs_destroy_ioend(ioend); return ret; @@ -1617,7 +1618,7 @@ xfs_vm_bmap( trace_xfs_vm_bmap(XFS_I(inode)); xfs_ilock(ip, XFS_IOLOCK_SHARED); - xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); + filemap_write_and_wait(mapping); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); } diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 0ca1f0be62d2..aaf472532b3c 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -903,11 +903,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ dp = args->dp; args->blkno = 0; - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) - return(error); - ASSERT(bp != NULL); + return error; /* * Look up the given attribute in the leaf block. Figure out if @@ -1031,12 +1029,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Read in the block containing the "old" attr, then * remove the "old" attr from that block (neat, huh!) */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, - &bp, XFS_ATTR_FORK); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, + -1, &bp); if (error) - return(error); - ASSERT(bp != NULL); - (void)xfs_attr_leaf_remove(bp, args); + return error; + + xfs_attr_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. @@ -1100,20 +1098,17 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) */ dp = args->dp; args->blkno = 0; - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); - if (error) { - return(error); - } + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) + return error; - ASSERT(bp != NULL); error = xfs_attr_leaf_lookup_int(bp, args); if (error == ENOATTR) { xfs_trans_brelse(args->trans, bp); return(error); } - (void)xfs_attr_leaf_remove(bp, args); + xfs_attr_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. @@ -1155,12 +1150,12 @@ xfs_attr_leaf_get(xfs_da_args_t *args) struct xfs_buf *bp; int error; + trace_xfs_attr_leaf_get(args); + args->blkno = 0; - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) - return(error); - ASSERT(bp != NULL); + return error; error = xfs_attr_leaf_lookup_int(bp, args); if (error != EEXIST) { @@ -1181,22 +1176,15 @@ xfs_attr_leaf_get(xfs_da_args_t *args) STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context) { - xfs_attr_leafblock_t *leaf; int error; struct xfs_buf *bp; + trace_xfs_attr_leaf_list(context); + context->cursor->blkno = 0; - error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); + error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp); if (error) return XFS_ERROR(error); - ASSERT(bp != NULL); - leaf = bp->b_addr; - if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) { - XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW, - context->dp->i_mount, leaf); - xfs_trans_brelse(NULL, bp); - return XFS_ERROR(EFSCORRUPTED); - } error = xfs_attr_leaf_list_int(bp, context); xfs_trans_brelse(NULL, bp); @@ -1600,12 +1588,9 @@ xfs_attr_node_removename(xfs_da_args_t *args) ASSERT(state->path.blk[0].bp); state->path.blk[0].bp = NULL; - error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, - XFS_ATTR_FORK); + error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp); if (error) goto out; - ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) == - cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { xfs_bmap_init(args->flist, args->firstblock); @@ -1653,6 +1638,8 @@ xfs_attr_fillstate(xfs_da_state_t *state) xfs_da_state_blk_t *blk; int level; + trace_xfs_attr_fillstate(state->args); + /* * Roll down the "path" in the state structure, storing the on-disk * block number for those buffers in the "path". @@ -1699,6 +1686,8 @@ xfs_attr_refillstate(xfs_da_state_t *state) xfs_da_state_blk_t *blk; int level, error; + trace_xfs_attr_refillstate(state->args); + /* * Roll down the "path" in the state structure, storing the on-disk * block number for those buffers in the "path". @@ -1707,7 +1696,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_read_buf(state->args->trans, + error = xfs_da_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, &blk->bp, XFS_ATTR_FORK); @@ -1726,7 +1715,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_read_buf(state->args->trans, + error = xfs_da_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, &blk->bp, XFS_ATTR_FORK); @@ -1755,6 +1744,8 @@ xfs_attr_node_get(xfs_da_args_t *args) int error, retval; int i; + trace_xfs_attr_node_get(args); + state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; @@ -1804,6 +1795,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) int error, i; struct xfs_buf *bp; + trace_xfs_attr_node_list(context); + cursor = context->cursor; cursor->initted = 1; @@ -1814,7 +1807,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ bp = NULL; if (cursor->blkno > 0) { - error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, + error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); if ((error != 0) && (error != EFSCORRUPTED)) return(error); @@ -1856,17 +1849,11 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) if (bp == NULL) { cursor->blkno = 0; for (;;) { - error = xfs_da_read_buf(NULL, context->dp, + error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); if (error) return(error); - if (unlikely(bp == NULL)) { - XFS_ERROR_REPORT("xfs_attr_node_list(2)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount); - return(XFS_ERROR(EFSCORRUPTED)); - } node = bp->b_addr; if (node->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) @@ -1907,14 +1894,6 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ for (;;) { leaf = bp->b_addr; - if (unlikely(leaf->hdr.info.magic != - cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) { - XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount, leaf); - xfs_trans_brelse(NULL, bp); - return(XFS_ERROR(EFSCORRUPTED)); - } error = xfs_attr_leaf_list_int(bp, context); if (error) { xfs_trans_brelse(NULL, bp); @@ -1924,16 +1903,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) break; cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); xfs_trans_brelse(NULL, bp); - error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK); + error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1, + &bp); if (error) - return(error); - if (unlikely((bp == NULL))) { - XFS_ERROR_REPORT("xfs_attr_node_list(5)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount); - return(XFS_ERROR(EFSCORRUPTED)); - } + return error; } xfs_trans_brelse(NULL, bp); return(0); @@ -1959,6 +1932,8 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) int nmap, error, tmp, valuelen, blkcnt, i; xfs_dablk_t lblkno; + trace_xfs_attr_rmtval_get(args); + ASSERT(!(args->flags & ATTR_KERNOVAL)); mp = args->dp->i_mount; @@ -1980,7 +1955,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, - dblkno, blkcnt, 0, &bp); + dblkno, blkcnt, 0, &bp, NULL); if (error) return(error); @@ -2014,6 +1989,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) xfs_dablk_t lblkno; int blkcnt, valuelen, nmap, error, tmp, committed; + trace_xfs_attr_rmtval_set(args); + dp = args->dp; mp = dp->i_mount; src = args->value; @@ -2143,6 +2120,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) xfs_dablk_t lblkno; int valuelen, blkcnt, nmap, error, done, committed; + trace_xfs_attr_rmtval_remove(args); + mp = args->dp->i_mount; /* diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index d330111ca738..ee24993c7d12 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -57,7 +57,8 @@ STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block, struct xfs_buf **bpp); STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer, xfs_da_args_t *args, int freemap_index); -STATIC void xfs_attr_leaf_compact(xfs_trans_t *tp, struct xfs_buf *leaf_buffer); +STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args, + struct xfs_buf *leaf_buffer); STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); @@ -87,6 +88,52 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, xfs_mount_t *mp); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); +static void +xfs_attr_leaf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_attr_leaf_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC); + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_attr_leaf_read_verify( + struct xfs_buf *bp) +{ + xfs_attr_leaf_verify(bp); +} + +static void +xfs_attr_leaf_write_verify( + struct xfs_buf *bp) +{ + xfs_attr_leaf_verify(bp); +} + +const struct xfs_buf_ops xfs_attr_leaf_buf_ops = { + .verify_read = xfs_attr_leaf_read_verify, + .verify_write = xfs_attr_leaf_write_verify, +}; + +int +xfs_attr_leaf_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops); +} + /*======================================================================== * Namespace helper routines *========================================================================*/ @@ -869,17 +916,16 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) error = xfs_da_grow_inode(args, &blkno); if (error) goto out; - error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, - XFS_ATTR_FORK); + error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1); if (error) goto out; - ASSERT(bp1 != NULL); + bp2 = NULL; error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, XFS_ATTR_FORK); if (error) goto out; - ASSERT(bp2 != NULL); + bp2->b_ops = bp1->b_ops; memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); bp1 = NULL; xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); @@ -933,7 +979,7 @@ xfs_attr_leaf_create( XFS_ATTR_FORK); if (error) return(error); - ASSERT(bp != NULL); + bp->b_ops = &xfs_attr_leaf_buf_ops; leaf = bp->b_addr; memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); hdr = &leaf->hdr; @@ -1071,7 +1117,7 @@ xfs_attr_leaf_add( * Compact the entries to coalesce free space. * This may change the hdr->count via dropping INCOMPLETE entries. */ - xfs_attr_leaf_compact(args->trans, bp); + xfs_attr_leaf_compact(args, bp); /* * After compaction, the block is guaranteed to have only one @@ -1102,6 +1148,8 @@ xfs_attr_leaf_add_work( xfs_mount_t *mp; int tmp, i; + trace_xfs_attr_leaf_add_work(args); + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); hdr = &leaf->hdr; @@ -1214,15 +1262,17 @@ xfs_attr_leaf_add_work( */ STATIC void xfs_attr_leaf_compact( - struct xfs_trans *trans, - struct xfs_buf *bp) + struct xfs_da_args *args, + struct xfs_buf *bp) { - xfs_attr_leafblock_t *leaf_s, *leaf_d; - xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; - xfs_mount_t *mp; - char *tmpbuffer; + xfs_attr_leafblock_t *leaf_s, *leaf_d; + xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; + struct xfs_trans *trans = args->trans; + struct xfs_mount *mp = trans->t_mountp; + char *tmpbuffer; + + trace_xfs_attr_leaf_compact(args); - mp = trans->t_mountp; tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); ASSERT(tmpbuffer != NULL); memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); @@ -1291,6 +1341,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, leaf2 = blk2->bp->b_addr; ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + ASSERT(leaf2->hdr.count == 0); args = state->args; trace_xfs_attr_leaf_rebalance(args); @@ -1344,9 +1395,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, max = be16_to_cpu(hdr2->firstused) - sizeof(xfs_attr_leaf_hdr_t); max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t); - if (space > max) { - xfs_attr_leaf_compact(args->trans, blk2->bp); - } + if (space > max) + xfs_attr_leaf_compact(args, blk2->bp); /* * Move high entries from leaf1 to low end of leaf2. @@ -1361,6 +1411,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * I assert that since all callers pass in an empty * second buffer, this code should never execute. */ + ASSERT(0); /* * Figure the total bytes to be added to the destination leaf. @@ -1376,9 +1427,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, max = be16_to_cpu(hdr1->firstused) - sizeof(xfs_attr_leaf_hdr_t); max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t); - if (space > max) { - xfs_attr_leaf_compact(args->trans, blk1->bp); - } + if (space > max) + xfs_attr_leaf_compact(args, blk1->bp); /* * Move low entries from leaf2 to high end of leaf1. @@ -1422,10 +1472,24 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, args->index2 = 0; args->blkno2 = blk2->blkno; } else { + /* + * On a double leaf split, the original attr location + * is already stored in blkno2/index2, so don't + * overwrite it overwise we corrupt the tree. + */ blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); - args->index = args->index2 = blk2->index; - args->blkno = args->blkno2 = blk2->blkno; + args->index = blk2->index; + args->blkno = blk2->blkno; + if (!state->extravalid) { + /* + * set the new attr location to match the old + * one and let the higher level split code + * decide where in the leaf to place it. + */ + args->index2 = blk2->index; + args->blkno2 = blk2->blkno; + } } } else { ASSERT(state->inleaf == 1); @@ -1561,6 +1625,8 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) xfs_dablk_t blkno; struct xfs_buf *bp; + trace_xfs_attr_leaf_toosmall(state->args); + /* * Check for the degenerate case of the block being over 50% full. * If so, it's not worth even looking to see if we might be able @@ -1620,18 +1686,16 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) blkno = be32_to_cpu(info->back); if (blkno == 0) continue; - error = xfs_da_read_buf(state->args->trans, state->args->dp, - blkno, -1, &bp, XFS_ATTR_FORK); + error = xfs_attr_leaf_read(state->args->trans, state->args->dp, + blkno, -1, &bp); if (error) return(error); - ASSERT(bp != NULL); leaf = (xfs_attr_leafblock_t *)info; count = be16_to_cpu(leaf->hdr.count); bytes = state->blocksize - (state->blocksize>>2); bytes -= be16_to_cpu(leaf->hdr.usedbytes); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); count += be16_to_cpu(leaf->hdr.count); bytes -= be16_to_cpu(leaf->hdr.usedbytes); bytes -= count * sizeof(xfs_attr_leaf_entry_t); @@ -1686,6 +1750,8 @@ xfs_attr_leaf_remove( int tablesize, tmp, i; xfs_mount_t *mp; + trace_xfs_attr_leaf_remove(args); + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); hdr = &leaf->hdr; @@ -2495,15 +2561,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) /* * Set up the operation. */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); - if (error) { + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) return(error); - } - ASSERT(bp != NULL); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); ASSERT(args->index >= 0); entry = &leaf->entries[ args->index ]; @@ -2560,15 +2622,11 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) /* * Set up the operation. */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); - if (error) { + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) return(error); - } - ASSERT(bp != NULL); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); ASSERT(args->index >= 0); entry = &leaf->entries[ args->index ]; @@ -2617,35 +2675,28 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) /* * Read the block containing the "old" attr */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1, - XFS_ATTR_FORK); - if (error) { - return(error); - } - ASSERT(bp1 != NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1); + if (error) + return error; /* * Read the block containing the "new" attr, if it is different */ if (args->blkno2 != args->blkno) { - error = xfs_da_read_buf(args->trans, args->dp, args->blkno2, - -1, &bp2, XFS_ATTR_FORK); - if (error) { - return(error); - } - ASSERT(bp2 != NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2, + -1, &bp2); + if (error) + return error; } else { bp2 = bp1; } leaf1 = bp1->b_addr; - ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); ASSERT(args->index >= 0); entry1 = &leaf1->entries[ args->index ]; leaf2 = bp2->b_addr; - ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); ASSERT(args->index2 >= 0); entry2 = &leaf2->entries[ args->index2 ]; @@ -2730,7 +2781,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * the extents in reverse order the extent containing * block 0 must still be there. */ - error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); + error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) return(error); blkno = XFS_BUF_ADDR(bp); @@ -2815,7 +2866,7 @@ xfs_attr_node_inactive( * traversal of the tree so we may deal with many blocks * before we come back to this one. */ - error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp, + error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp, XFS_ATTR_FORK); if (error) return(error); @@ -2856,8 +2907,8 @@ xfs_attr_node_inactive( * child block number. */ if ((i+1) < count) { - error = xfs_da_read_buf(*trans, dp, 0, parent_blkno, - &bp, XFS_ATTR_FORK); + error = xfs_da_node_read(*trans, dp, 0, parent_blkno, + &bp, XFS_ATTR_FORK); if (error) return(error); child_fsb = be32_to_cpu(node->btree[i+1].before); diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index dea17722945e..77de139a58f0 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -261,4 +261,10 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp); int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local); +int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp); + +extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops; + #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 848ffa77707b..0e92d12765d2 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2437,6 +2437,7 @@ xfs_bmap_btalloc( * Normal allocation, done through xfs_alloc_vextent. */ tryagain = isaligned = 0; + memset(&args, 0, sizeof(args)); args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; @@ -2661,8 +2662,9 @@ xfs_bmap_btree_to_extents( if ((error = xfs_btree_check_lptr(cur, cbno, 1))) return error; #endif - if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) return error; cblock = XFS_BUF_TO_BLOCK(cbp); if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) @@ -3082,6 +3084,7 @@ xfs_bmap_extents_to_btree( * Convert to a btree with two levels, one record in root. */ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = mp; args.firstblock = *firstblock; @@ -3121,6 +3124,7 @@ xfs_bmap_extents_to_btree( /* * Fill in the child block. */ + abp->b_ops = &xfs_bmbt_buf_ops; ablock = XFS_BUF_TO_BLOCK(abp); ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); ablock->bb_level = 0; @@ -3237,6 +3241,7 @@ xfs_bmap_local_to_extents( xfs_buf_t *bp; /* buffer for extent block */ xfs_bmbt_rec_host_t *ep;/* extent record pointer */ + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; args.firstblock = *firstblock; @@ -3266,6 +3271,7 @@ xfs_bmap_local_to_extents( ASSERT(args.len == 1); *firstblock = args.fsbno; bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); + bp->b_ops = &xfs_bmbt_buf_ops; memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); xfs_bmap_forkoff_reset(args.mp, ip, whichfork); @@ -4075,8 +4081,9 @@ xfs_bmap_read_extents( * pointer (leftmost) at each level. */ while (level-- > 0) { - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (error) return error; block = XFS_BUF_TO_BLOCK(bp); XFS_WANT_CORRUPTED_GOTO( @@ -4121,7 +4128,8 @@ xfs_bmap_read_extents( */ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); if (nextbno != NULLFSBLOCK) - xfs_btree_reada_bufl(mp, nextbno, 1); + xfs_btree_reada_bufl(mp, nextbno, 1, + &xfs_bmbt_buf_ops); /* * Copy records into the extent records. */ @@ -4153,8 +4161,9 @@ xfs_bmap_read_extents( */ if (bno == NULLFSBLOCK) break; - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (error) return error; block = XFS_BUF_TO_BLOCK(bp); } @@ -4616,12 +4625,11 @@ xfs_bmapi_delay( STATIC int -xfs_bmapi_allocate( - struct xfs_bmalloca *bma, - int flags) +__xfs_bmapi_allocate( + struct xfs_bmalloca *bma) { struct xfs_mount *mp = bma->ip->i_mount; - int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); int tmp_logflags = 0; @@ -4654,24 +4662,27 @@ xfs_bmapi_allocate( * Indicate if this is the first user data in the file, or just any * user data. */ - if (!(flags & XFS_BMAPI_METADATA)) { + if (!(bma->flags & XFS_BMAPI_METADATA)) { bma->userdata = (bma->offset == 0) ? XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; } - bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1; + bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; /* * Only want to do the alignment at the eof if it is userdata and * allocation length is larger than a stripe unit. */ if (mp->m_dalign && bma->length >= mp->m_dalign && - !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { + !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { error = xfs_bmap_isaeof(bma, whichfork); if (error) return error; } + if (bma->flags & XFS_BMAPI_STACK_SWITCH) + bma->stack_switch = 1; + error = xfs_bmap_alloc(bma); if (error) return error; @@ -4706,7 +4717,7 @@ xfs_bmapi_allocate( * A wasdelay extent has been initialized, so shouldn't be flagged * as unwritten. */ - if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && + if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && xfs_sb_version_hasextflgbit(&mp->m_sb)) bma->got.br_state = XFS_EXT_UNWRITTEN; @@ -4734,6 +4745,45 @@ xfs_bmapi_allocate( return 0; } +static void +xfs_bmapi_allocate_worker( + struct work_struct *work) +{ + struct xfs_bmalloca *args = container_of(work, + struct xfs_bmalloca, work); + unsigned long pflags; + + /* we are in a transaction context here */ + current_set_flags_nested(&pflags, PF_FSTRANS); + + args->result = __xfs_bmapi_allocate(args); + complete(args->done); + + current_restore_flags_nested(&pflags, PF_FSTRANS); +} + +/* + * Some allocation requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. Otherwise just + * call directly to avoid the context switch overhead here. + */ +int +xfs_bmapi_allocate( + struct xfs_bmalloca *args) +{ + DECLARE_COMPLETION_ONSTACK(done); + + if (!args->stack_switch) + return __xfs_bmapi_allocate(args); + + + args->done = &done; + INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); + queue_work(xfs_alloc_wq, &args->work); + wait_for_completion(&done); + return args->result; +} + STATIC int xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, @@ -4919,6 +4969,7 @@ xfs_bmapi_write( bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; bma.offset = bno; + bma.flags = flags; /* * There's a 32/64 bit type mismatch between the @@ -4934,7 +4985,7 @@ xfs_bmapi_write( ASSERT(len > 0); ASSERT(bma.length > 0); - error = xfs_bmapi_allocate(&bma, flags); + error = xfs_bmapi_allocate(&bma); if (error) goto error0; if (bma.blkno == NULLFSBLOCK) @@ -5554,7 +5605,7 @@ xfs_getbmap( xfs_ilock(ip, XFS_IOLOCK_SHARED); if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { - error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); + error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); if (error) goto out_unlock_iolock; } @@ -5823,15 +5874,16 @@ xfs_bmap_check_leaf_extents( */ while (level-- > 0) { /* See if buf is in cur first */ + bp_release = 0; bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (bp) { - bp_release = 0; - } else { + if (!bp) { bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + goto error_norelse; } - if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) - goto error_norelse; block = XFS_BUF_TO_BLOCK(bp); XFS_WANT_CORRUPTED_GOTO( xfs_bmap_sanity_check(mp, bp, level), @@ -5908,15 +5960,16 @@ xfs_bmap_check_leaf_extents( if (bno == NULLFSBLOCK) break; + bp_release = 0; bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (bp) { - bp_release = 0; - } else { + if (!bp) { bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + goto error_norelse; } - if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) - goto error_norelse; block = XFS_BUF_TO_BLOCK(bp); } if (bp_release) { @@ -6007,7 +6060,9 @@ xfs_bmap_count_tree( struct xfs_btree_block *block, *nextblock; int numrecs; - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) return error; *count += 1; block = XFS_BUF_TO_BLOCK(bp); @@ -6016,8 +6071,10 @@ xfs_bmap_count_tree( /* Not at node above leaves, count this level of nodes */ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); while (nextbno != NULLFSBLOCK) { - if ((error = xfs_btree_read_bufl(mp, tp, nextbno, - 0, &nbp, XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) return error; *count += 1; nextblock = XFS_BUF_TO_BLOCK(nbp); @@ -6046,8 +6103,10 @@ xfs_bmap_count_tree( if (nextbno == NULLFSBLOCK) break; bno = nextbno; - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) return error; *count += 1; block = XFS_BUF_TO_BLOCK(bp); diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 803b56d7ce16..5f469c3516eb 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -77,6 +77,7 @@ typedef struct xfs_bmap_free * from written to unwritten, otherwise convert from unwritten to written. */ #define XFS_BMAPI_CONVERT 0x040 +#define XFS_BMAPI_STACK_SWITCH 0x080 #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -85,7 +86,8 @@ typedef struct xfs_bmap_free { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ - { XFS_BMAPI_CONVERT, "CONVERT" } + { XFS_BMAPI_CONVERT, "CONVERT" }, \ + { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" } static inline int xfs_bmapi_aflag(int w) @@ -133,6 +135,11 @@ typedef struct xfs_bmalloca { char userdata;/* set if is user data */ char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ + char stack_switch; + int flags; + struct completion *done; + struct work_struct work; + int result; } xfs_bmalloca_t; /* diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 862084a47a7e..061b45cbe614 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -36,6 +36,7 @@ #include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_quota.h" +#include "xfs_trace.h" /* * Determine the extent state. @@ -707,6 +708,67 @@ xfs_bmbt_key_diff( cur->bc_rec.b.br_startoff; } +static void +xfs_bmbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + unsigned int level; + int lblock_ok; /* block passes checks */ + + /* magic number and level verification. + * + * We don't know waht fork we belong to, so just verify that the level + * is less than the maximum of the two. Later checks will be more + * precise. + */ + level = be16_to_cpu(block->bb_level); + lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) && + level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]); + + /* numrecs verification */ + lblock_ok = lblock_ok && + be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0]; + + /* sibling pointer verification */ + lblock_ok = lblock_ok && + block->bb_u.l.bb_leftsib && + (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_leftsib))) && + block->bb_u.l.bb_rightsib && + (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_rightsib))); + + if (!lblock_ok) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_bmbt_read_verify( + struct xfs_buf *bp) +{ + xfs_bmbt_verify(bp); +} + +static void +xfs_bmbt_write_verify( + struct xfs_buf *bp) +{ + xfs_bmbt_verify(bp); +} + +const struct xfs_buf_ops xfs_bmbt_buf_ops = { + .verify_read = xfs_bmbt_read_verify, + .verify_write = xfs_bmbt_write_verify, +}; + + #ifdef DEBUG STATIC int xfs_bmbt_keys_inorder( @@ -746,6 +808,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, + .buf_ops = &xfs_bmbt_buf_ops, #ifdef DEBUG .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 0e66c4ea0f85..88469ca08696 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -236,5 +236,6 @@ extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); +extern const struct xfs_buf_ops xfs_bmbt_buf_ops; #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index e53e317b1582..db010408d701 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -266,9 +266,13 @@ xfs_btree_dup_cursor( for (i = 0; i < new->bc_nlevels; i++) { new->bc_ptrs[i] = cur->bc_ptrs[i]; new->bc_ra[i] = cur->bc_ra[i]; - if ((bp = cur->bc_bufs[i])) { - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) { + bp = cur->bc_bufs[i]; + if (bp) { + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_BUF_ADDR(bp), mp->m_bsize, + 0, &bp, + cur->bc_ops->buf_ops); + if (error) { xfs_btree_del_cursor(new, error); *ncur = NULL; return error; @@ -609,25 +613,26 @@ xfs_btree_offsets( * Get a buffer for the block, return it read in. * Long-form addressing. */ -int /* error */ +int xfs_btree_read_bufl( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_fsblock_t fsbno, /* file system block number */ - uint lock, /* lock flags for read_buf */ - xfs_buf_t **bpp, /* buffer for fsbno */ - int refval) /* ref count value for buffer */ -{ - xfs_buf_t *bp; /* return value */ + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock, /* lock flags for read_buf */ + struct xfs_buf **bpp, /* buffer for fsbno */ + int refval, /* ref count value for buffer */ + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; /* return value */ xfs_daddr_t d; /* real disk block address */ - int error; + int error; ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, lock, &bp))) { + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, lock, &bp, ops); + if (error) return error; - } ASSERT(!xfs_buf_geterror(bp)); if (bp) xfs_buf_set_ref(bp, refval); @@ -642,15 +647,16 @@ xfs_btree_read_bufl( /* ARGSUSED */ void xfs_btree_reada_bufl( - xfs_mount_t *mp, /* file system mount point */ - xfs_fsblock_t fsbno, /* file system block number */ - xfs_extlen_t count) /* count of filesystem blocks */ + struct xfs_mount *mp, /* file system mount point */ + xfs_fsblock_t fsbno, /* file system block number */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops) { xfs_daddr_t d; ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); } /* @@ -660,17 +666,18 @@ xfs_btree_reada_bufl( /* ARGSUSED */ void xfs_btree_reada_bufs( - xfs_mount_t *mp, /* file system mount point */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - xfs_extlen_t count) /* count of filesystem blocks */ + struct xfs_mount *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops) { xfs_daddr_t d; ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); } STATIC int @@ -684,12 +691,14 @@ xfs_btree_readahead_lblock( xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { - xfs_btree_reada_bufl(cur->bc_mp, left, 1); + xfs_btree_reada_bufl(cur->bc_mp, left, 1, + cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { - xfs_btree_reada_bufl(cur->bc_mp, right, 1); + xfs_btree_reada_bufl(cur->bc_mp, right, 1, + cur->bc_ops->buf_ops); rval++; } @@ -709,13 +718,13 @@ xfs_btree_readahead_sblock( if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - left, 1); + left, 1, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - right, 1); + right, 1, cur->bc_ops->buf_ops); rval++; } @@ -853,18 +862,22 @@ xfs_btree_set_sibling( } } -STATIC void +void xfs_btree_init_block( - struct xfs_btree_cur *cur, - int level, - int numrecs, - struct xfs_btree_block *new) /* new block */ + struct xfs_mount *mp, + struct xfs_buf *bp, + __u32 magic, + __u16 level, + __u16 numrecs, + unsigned int flags) { - new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); + struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp); + + new->bb_magic = cpu_to_be32(magic); new->bb_level = cpu_to_be16(level); new->bb_numrecs = cpu_to_be16(numrecs); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (flags & XFS_BTREE_LONG_PTRS) { new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); } else { @@ -873,6 +886,17 @@ xfs_btree_init_block( } } +STATIC void +xfs_btree_init_block_cur( + struct xfs_btree_cur *cur, + int level, + int numrecs, + struct xfs_buf *bp) +{ + xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum], + level, numrecs, cur->bc_flags); +} + /* * Return true if ptr is the last record in the btree and * we need to track updateÑ• to this record. The decision @@ -972,6 +996,7 @@ xfs_btree_get_buf_block( if (!*bpp) return ENOMEM; + (*bpp)->b_ops = cur->bc_ops->buf_ops; *block = XFS_BUF_TO_BLOCK(*bpp); return 0; } @@ -998,19 +1023,15 @@ xfs_btree_read_buf_block( d = xfs_btree_ptr_to_daddr(cur, ptr); error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, - mp->m_bsize, flags, bpp); + mp->m_bsize, flags, bpp, + cur->bc_ops->buf_ops); if (error) return error; ASSERT(!xfs_buf_geterror(*bpp)); - xfs_btree_set_refs(cur, *bpp); *block = XFS_BUF_TO_BLOCK(*bpp); - - error = xfs_btree_check_block(cur, *block, level, *bpp); - if (error) - xfs_trans_brelse(cur->bc_tp, *bpp); - return error; + return 0; } /* @@ -2183,7 +2204,7 @@ xfs_btree_split( goto error0; /* Fill in the btree header for the new right block. */ - xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right); + xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp); /* * Split the entries between the old and the new block evenly. @@ -2492,7 +2513,7 @@ xfs_btree_new_root( nptr = 2; } /* Fill in the new block's btree header and log it. */ - xfs_btree_init_block(cur, cur->bc_nlevels, 2, new); + xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp); xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && !xfs_btree_ptr_is_null(cur, &rptr)); diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 5b240de104c0..f932897194eb 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -188,6 +188,8 @@ struct xfs_btree_ops { __int64_t (*key_diff)(struct xfs_btree_cur *cur, union xfs_btree_key *key); + const struct xfs_buf_ops *buf_ops; + #ifdef DEBUG /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, @@ -355,7 +357,8 @@ xfs_btree_read_bufl( xfs_fsblock_t fsbno, /* file system block number */ uint lock, /* lock flags for read_buf */ struct xfs_buf **bpp, /* buffer for fsbno */ - int refval);/* ref count value for buffer */ + int refval, /* ref count value for buffer */ + const struct xfs_buf_ops *ops); /* * Read-ahead the block, don't wait for it, don't return a buffer. @@ -365,7 +368,8 @@ void /* error */ xfs_btree_reada_bufl( struct xfs_mount *mp, /* file system mount point */ xfs_fsblock_t fsbno, /* file system block number */ - xfs_extlen_t count); /* count of filesystem blocks */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops); /* * Read-ahead the block, don't wait for it, don't return a buffer. @@ -376,8 +380,20 @@ xfs_btree_reada_bufs( struct xfs_mount *mp, /* file system mount point */ xfs_agnumber_t agno, /* allocation group number */ xfs_agblock_t agbno, /* allocation group block number */ - xfs_extlen_t count); /* count of filesystem blocks */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops); +/* + * Initialise a new btree block header + */ +void +xfs_btree_init_block( + struct xfs_mount *mp, + struct xfs_buf *bp, + __u32 magic, + __u16 level, + __u16 numrecs, + unsigned int flags); /* * Common btree core entry points. diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 933b7930b863..26673a0b20e7 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -569,7 +569,9 @@ found: */ if (bp->b_flags & XBF_STALE) { ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); + ASSERT(bp->b_iodone == NULL); bp->b_flags &= _XBF_KMEM | _XBF_PAGES; + bp->b_ops = NULL; } trace_xfs_buf_find(bp, flags, _RET_IP_); @@ -654,7 +656,8 @@ xfs_buf_read_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; @@ -666,6 +669,7 @@ xfs_buf_read_map( if (!XFS_BUF_ISDONE(bp)) { XFS_STATS_INC(xb_get_read); + bp->b_ops = ops; _xfs_buf_read(bp, flags); } else if (flags & XBF_ASYNC) { /* @@ -691,13 +695,14 @@ void xfs_buf_readahead_map( struct xfs_buftarg *target, struct xfs_buf_map *map, - int nmaps) + int nmaps, + const struct xfs_buf_ops *ops) { if (bdi_read_congested(target->bt_bdi)) return; xfs_buf_read_map(target, map, nmaps, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops); } /* @@ -709,10 +714,10 @@ xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, - int flags) + int flags, + const struct xfs_buf_ops *ops) { - xfs_buf_t *bp; - int error; + struct xfs_buf *bp; bp = xfs_buf_get_uncached(target, numblks, flags); if (!bp) @@ -723,13 +728,10 @@ xfs_buf_read_uncached( bp->b_bn = daddr; bp->b_maps[0].bm_bn = daddr; bp->b_flags |= XBF_READ; + bp->b_ops = ops; xfsbdstrat(target->bt_mount, bp); - error = xfs_buf_iowait(bp); - if (error) { - xfs_buf_relse(bp); - return NULL; - } + xfs_buf_iowait(bp); return bp; } @@ -999,27 +1001,37 @@ STATIC void xfs_buf_iodone_work( struct work_struct *work) { - xfs_buf_t *bp = + struct xfs_buf *bp = container_of(work, xfs_buf_t, b_iodone_work); + bool read = !!(bp->b_flags & XBF_READ); + + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); + if (read && bp->b_ops) + bp->b_ops->verify_read(bp); if (bp->b_iodone) (*(bp->b_iodone))(bp); else if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); + else { + ASSERT(read && bp->b_ops); + complete(&bp->b_iowait); + } } void xfs_buf_ioend( - xfs_buf_t *bp, - int schedule) + struct xfs_buf *bp, + int schedule) { + bool read = !!(bp->b_flags & XBF_READ); + trace_xfs_buf_iodone(bp, _RET_IP_); - bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); if (bp->b_error == 0) bp->b_flags |= XBF_DONE; - if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { + if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) { if (schedule) { INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); queue_work(xfslogd_workqueue, &bp->b_iodone_work); @@ -1027,6 +1039,7 @@ xfs_buf_ioend( xfs_buf_iodone_work(&bp->b_iodone_work); } } else { + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); complete(&bp->b_iowait); } } @@ -1197,9 +1210,14 @@ xfs_buf_bio_end_io( { xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; - xfs_buf_ioerror(bp, -error); + /* + * don't overwrite existing errors - otherwise we can lose errors on + * buffers that require multiple bios to complete. + */ + if (!bp->b_error) + xfs_buf_ioerror(bp, -error); - if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) + if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); _xfs_buf_ioend(bp, 1); @@ -1279,6 +1297,11 @@ next_chunk: if (size) goto next_chunk; } else { + /* + * This is guaranteed not to be the last io reference count + * because the caller (xfs_buf_iorequest) holds a count itself. + */ + atomic_dec(&bp->b_io_remaining); xfs_buf_ioerror(bp, EIO); bio_put(bio); } @@ -1304,6 +1327,20 @@ _xfs_buf_ioapply( rw |= REQ_FUA; if (bp->b_flags & XBF_FLUSH) rw |= REQ_FLUSH; + + /* + * Run the write verifier callback function if it exists. If + * this function fails it will mark the buffer with an error and + * the IO should not be dispatched. + */ + if (bp->b_ops) { + bp->b_ops->verify_write(bp); + if (bp->b_error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_CORRUPT_INCORE); + return; + } + } } else if (bp->b_flags & XBF_READ_AHEAD) { rw = READA; } else { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 7c0b6a0a1557..23f5642480bb 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -100,6 +100,7 @@ typedef struct xfs_buftarg { struct xfs_buf; typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); + #define XB_PAGES 2 struct xfs_buf_map { @@ -110,6 +111,11 @@ struct xfs_buf_map { #define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; +struct xfs_buf_ops { + void (*verify_read)(struct xfs_buf *); + void (*verify_write)(struct xfs_buf *); +}; + typedef struct xfs_buf { /* * first cacheline holds all the fields needed for an uncontended cache @@ -153,13 +159,13 @@ typedef struct xfs_buf { unsigned int b_page_count; /* size of page array */ unsigned int b_offset; /* page offset in first page */ unsigned short b_error; /* error code on I/O */ + const struct xfs_buf_ops *b_ops; #ifdef XFS_BUF_LOCK_TRACKING int b_last_holder; #endif } xfs_buf_t; - /* Finding and Reading Buffers */ struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, @@ -196,9 +202,11 @@ struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, xfs_buf_flags_t flags); struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags); + xfs_buf_flags_t flags, + const struct xfs_buf_ops *ops); void xfs_buf_readahead_map(struct xfs_buftarg *target, - struct xfs_buf_map *map, int nmaps); + struct xfs_buf_map *map, int nmaps, + const struct xfs_buf_ops *ops); static inline struct xfs_buf * xfs_buf_get( @@ -216,20 +224,22 @@ xfs_buf_read( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_read_map(target, &map, 1, flags); + return xfs_buf_read_map(target, &map, 1, flags, ops); } static inline void xfs_buf_readahead( struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks) + size_t numblks, + const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_readahead_map(target, &map, 1); + return xfs_buf_readahead_map(target, &map, 1, ops); } struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); @@ -239,7 +249,8 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags); struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, - xfs_daddr_t daddr, size_t numblks, int flags); + xfs_daddr_t daddr, size_t numblks, int flags, + const struct xfs_buf_ops *ops); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a8d0ed911196..becf4a97efc6 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -526,7 +526,25 @@ xfs_buf_item_unpin( } xfs_buf_relse(bp); } else if (freed && remove) { + /* + * There are currently two references to the buffer - the active + * LRU reference and the buf log item. What we are about to do + * here - simulate a failed IO completion - requires 3 + * references. + * + * The LRU reference is removed by the xfs_buf_stale() call. The + * buf item reference is removed by the xfs_buf_iodone() + * callback that is run by xfs_buf_do_callbacks() during ioend + * processing (via the bp->b_iodone callback), and then finally + * the ioend processing will drop the IO reference if the buffer + * is marked XBF_ASYNC. + * + * Hence we need to take an additional reference here so that IO + * completion processing doesn't free the buffer prematurely. + */ xfs_buf_lock(bp); + xfs_buf_hold(bp); + bp->b_flags |= XBF_ASYNC; xfs_buf_ioerror(bp, EIO); XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/xfs_cksum.h new file mode 100644 index 000000000000..fad1676ad8cd --- /dev/null +++ b/fs/xfs/xfs_cksum.h @@ -0,0 +1,63 @@ +#ifndef _XFS_CKSUM_H +#define _XFS_CKSUM_H 1 + +#define XFS_CRC_SEED (~(__uint32_t)0) + +/* + * Calculate the intermediate checksum for a buffer that has the CRC field + * inside it. The offset of the 32bit crc fields is passed as the + * cksum_offset parameter. + */ +static inline __uint32_t +xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t zero = 0; + __uint32_t crc; + + /* Calculate CRC up to the checksum. */ + crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset); + + /* Skip checksum field */ + crc = crc32c(crc, &zero, sizeof(__u32)); + + /* Calculate the rest of the CRC. */ + return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)], + length - (cksum_offset + sizeof(__be32))); +} + +/* + * Convert the intermediate checksum to the final ondisk format. + * + * The CRC32c calculation uses LE format even on BE machines, but returns the + * result in host endian format. Hence we need to byte swap it back to LE format + * so that it is consistent on disk. + */ +static inline __le32 +xfs_end_cksum(__uint32_t crc) +{ + return ~cpu_to_le32(crc); +} + +/* + * Helper to generate the checksum for a buffer. + */ +static inline void +xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + + *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc); +} + +/* + * Helper to verify the checksum for a buffer. + */ +static inline int +xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + + return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc); +} + +#endif /* _XFS_CKSUM_H */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 7bfb7dd334fc..4d7696a02418 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -91,6 +91,84 @@ STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *save_blk); STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); +static void +xfs_da_node_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_da_node_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC); + block_ok = block_ok && + be16_to_cpu(hdr->level) > 0 && + be16_to_cpu(hdr->count) > 0 ; + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + +} + +static void +xfs_da_node_write_verify( + struct xfs_buf *bp) +{ + xfs_da_node_verify(bp); +} + +/* + * leaf/node format detection on trees is sketchy, so a node read can be done on + * leaf level blocks when detection identifies the tree as a node format tree + * incorrectly. In this case, we need to swap the verifier to match the correct + * format of the block being read. + */ +static void +xfs_da_node_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + xfs_da_node_verify(bp); + break; + case XFS_ATTR_LEAF_MAGIC: + bp->b_ops = &xfs_attr_leaf_buf_ops; + bp->b_ops->verify_read(bp); + return; + case XFS_DIR2_LEAFN_MAGIC: + bp->b_ops = &xfs_dir2_leafn_buf_ops; + bp->b_ops->verify_read(bp); + return; + default: + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + mp, info); + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } +} + +const struct xfs_buf_ops xfs_da_node_buf_ops = { + .verify_read = xfs_da_node_read_verify, + .verify_write = xfs_da_node_write_verify, +}; + + +int +xfs_da_node_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp, + int which_fork) +{ + return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + which_fork, &xfs_da_node_buf_ops); +} + /*======================================================================== * Routines used for growing the Btree. *========================================================================*/ @@ -125,6 +203,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, xfs_trans_log_buf(tp, bp, XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + bp->b_ops = &xfs_da_node_buf_ops; *bpp = bp; return(0); } @@ -324,6 +403,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, } memcpy(node, oldroot, size); xfs_trans_log_buf(tp, bp, 0, size - 1); + + bp->b_ops = blk1->bp->b_ops; blk1->bp = bp; blk1->blkno = blkno; @@ -746,7 +827,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) */ child = be32_to_cpu(oldroot->btree[0].before); ASSERT(child != 0); - error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, + error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp, args->whichfork); if (error) return(error); @@ -754,7 +835,14 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) xfs_da_blkinfo_onlychild_validate(bp->b_addr, be16_to_cpu(oldroot->hdr.level)); + /* + * This could be copying a leaf back into the root block in the case of + * there only being a single leaf block left in the tree. Hence we have + * to update the b_ops pointer as well to match the buffer type change + * that could occur. + */ memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); + root_blk->bp->b_ops = bp->b_ops; xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); error = xfs_da_shrink_inode(args, child, bp); return(error); @@ -779,6 +867,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) xfs_dablk_t blkno; struct xfs_buf *bp; + trace_xfs_da_node_toosmall(state->args); + /* * Check for the degenerate case of the block being over 50% full. * If so, it's not worth even looking to see if we might be able @@ -835,7 +925,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) blkno = be32_to_cpu(info->back); if (blkno == 0) continue; - error = xfs_da_read_buf(state->args->trans, state->args->dp, + error = xfs_da_node_read(state->args->trans, state->args->dp, blkno, -1, &bp, state->args->whichfork); if (error) return(error); @@ -900,6 +990,8 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) xfs_dahash_t lasthash=0; int level, count; + trace_xfs_da_fixhashpath(state->args); + level = path->active-1; blk = &path->blk[ level ]; switch (blk->magic) { @@ -1079,7 +1171,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) * Read the next node down in the tree. */ blk->blkno = blkno; - error = xfs_da_read_buf(args->trans, args->dp, blkno, + error = xfs_da_node_read(args->trans, args->dp, blkno, -1, &blk->bp, args->whichfork); if (error) { blk->blkno = 0; @@ -1241,7 +1333,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, new_info->forw = cpu_to_be32(old_blk->blkno); new_info->back = old_info->back; if (old_info->back) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(old_info->back), -1, &bp, args->whichfork); if (error) @@ -1262,7 +1354,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, new_info->forw = old_info->forw; new_info->back = cpu_to_be32(old_blk->blkno); if (old_info->forw) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(old_info->forw), -1, &bp, args->whichfork); if (error) @@ -1362,7 +1454,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_unlink_back(args); save_info->back = drop_info->back; if (drop_info->back) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(drop_info->back), -1, &bp, args->whichfork); if (error) @@ -1379,7 +1471,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_unlink_forward(args); save_info->forw = drop_info->forw; if (drop_info->forw) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(drop_info->forw), -1, &bp, args->whichfork); if (error) @@ -1417,6 +1509,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, xfs_dablk_t blkno=0; int level, error; + trace_xfs_da_path_shift(state->args); + /* * Roll up the Btree looking for the first block where our * current index is not at the edge of the block. Note that @@ -1463,8 +1557,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, * Read the next child block. */ blk->blkno = blkno; - error = xfs_da_read_buf(args->trans, args->dp, blkno, -1, - &blk->bp, args->whichfork); + error = xfs_da_node_read(args->trans, args->dp, blkno, -1, + &blk->bp, args->whichfork); if (error) return(error); ASSERT(blk->bp != NULL); @@ -1727,7 +1821,8 @@ xfs_da_swap_lastblock( * Read the last block in the btree space. */ last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; - if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w))) + error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w); + if (error) return error; /* * Copy the last block into the dead buffer and log it. @@ -1753,7 +1848,8 @@ xfs_da_swap_lastblock( * If the moved block has a left sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->back))) { - if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) + error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); + if (error) goto done; sib_info = sib_buf->b_addr; if (unlikely( @@ -1774,7 +1870,8 @@ xfs_da_swap_lastblock( * If the moved block has a right sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->forw))) { - if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) + error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); + if (error) goto done; sib_info = sib_buf->b_addr; if (unlikely( @@ -1797,7 +1894,8 @@ xfs_da_swap_lastblock( * Walk down the tree looking for the parent of the moved block. */ for (;;) { - if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) + error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); + if (error) goto done; par_node = par_buf->b_addr; if (unlikely(par_node->hdr.info.magic != @@ -1847,7 +1945,8 @@ xfs_da_swap_lastblock( error = XFS_ERROR(EFSCORRUPTED); goto done; } - if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) + error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); + if (error) goto done; par_node = par_buf->b_addr; if (unlikely( @@ -2133,7 +2232,8 @@ xfs_da_read_buf( xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, - int whichfork) + int whichfork, + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; struct xfs_buf_map map; @@ -2155,7 +2255,7 @@ xfs_da_read_buf( error = xfs_trans_read_buf_map(dp->i_mount, trans, dp->i_mount->m_ddev_targp, - mapp, nmap, 0, &bp); + mapp, nmap, 0, &bp, ops); if (error) goto out_free; @@ -2211,9 +2311,10 @@ xfs_da_reada_buf( struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, - int whichfork) + xfs_daddr_t mappedbno, + int whichfork, + const struct xfs_buf_ops *ops) { - xfs_daddr_t mappedbno = -1; struct xfs_buf_map map; struct xfs_buf_map *mapp; int nmap; @@ -2221,7 +2322,7 @@ xfs_da_reada_buf( mapp = ↦ nmap = 1; - error = xfs_dabuf_map(trans, dp, bno, -1, whichfork, + error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, &mapp, &nmap); if (error) { /* mapping a hole is not an error, but we don't continue */ @@ -2231,7 +2332,7 @@ xfs_da_reada_buf( } mappedbno = mapp[0].bm_bn; - xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap); + xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops); out_free: if (mapp != &map) diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 132adafb041e..ee5170c46ae1 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -18,7 +18,6 @@ #ifndef __XFS_DA_BTREE_H__ #define __XFS_DA_BTREE_H__ -struct xfs_buf; struct xfs_bmap_free; struct xfs_inode; struct xfs_mount; @@ -214,6 +213,9 @@ int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, */ int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, xfs_da_state_blk_t *new_blk); +int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp, int which_fork); /* * Utility routines. @@ -226,9 +228,11 @@ int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, struct xfs_buf **bp, int whichfork); int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, - struct xfs_buf **bpp, int whichfork); + struct xfs_buf **bpp, int whichfork, + const struct xfs_buf_ops *ops); xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, - xfs_dablk_t bno, int whichfork); + xfs_dablk_t bno, xfs_daddr_t mapped_bno, + int whichfork, const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index b9b8646e62db..d0e9c74d3d96 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -246,12 +246,10 @@ xfs_swap_extents( goto out_unlock; } - if (VN_CACHED(VFS_I(tip)) != 0) { - error = xfs_flushinval_pages(tip, 0, -1, - FI_REMAPF_LOCKED); - if (error) - goto out_unlock; - } + error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (error) + goto out_unlock; + truncate_pagecache_range(VFS_I(ip), 0, -1); /* Verify O_DIRECT for ftmp */ if (VN_CACHED(VFS_I(tip)) != 0) { @@ -315,8 +313,7 @@ xfs_swap_extents( * are safe. We don't really care if non-io related * fields change. */ - - xfs_tosspages(ip, 0, -1, FI_REMAPF); + truncate_pagecache_range(VFS_I(ip), 0, -1); tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); if ((error = xfs_trans_reserve(tp, 0, diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index e93ca8f054f4..7536faaa61e7 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -56,6 +56,214 @@ xfs_dir_startup(void) xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); } +static void +xfs_dir2_block_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; + + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_dir2_block_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_block_verify(bp); +} + +static void +xfs_dir2_block_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_block_verify(bp); +} + +const struct xfs_buf_ops xfs_dir2_block_buf_ops = { + .verify_read = xfs_dir2_block_read_verify, + .verify_write = xfs_dir2_block_write_verify, +}; + +static int +xfs_dir2_block_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dp->i_mount; + + return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, + XFS_DATA_FORK, &xfs_dir2_block_buf_ops); +} + +static void +xfs_dir2_block_need_space( + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_block_tail *btp, + struct xfs_dir2_leaf_entry *blp, + __be16 **tagpp, + struct xfs_dir2_data_unused **dupp, + struct xfs_dir2_data_unused **enddupp, + int *compact, + int len) +{ + struct xfs_dir2_data_free *bf; + __be16 *tagp = NULL; + struct xfs_dir2_data_unused *dup = NULL; + struct xfs_dir2_data_unused *enddup = NULL; + + *compact = 0; + bf = hdr->bestfree; + + /* + * If there are stale entries we'll use one for the leaf. + */ + if (btp->stale) { + if (be16_to_cpu(bf[0].length) >= len) { + /* + * The biggest entry enough to avoid compaction. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[0].offset)); + goto out; + } + + /* + * Will need to compact to make this work. + * Tag just before the first leaf entry. + */ + *compact = 1; + tagp = (__be16 *)blp - 1; + + /* Data object just before the first leaf entry. */ + dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); + + /* + * If it's not free then the data will go where the + * leaf data starts now, if it works at all. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * + (uint)sizeof(*blp) < len) + dup = NULL; + } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) + dup = NULL; + else + dup = (xfs_dir2_data_unused_t *)blp; + goto out; + } + + /* + * no stale entries, so just use free space. + * Tag just before the first leaf entry. + */ + tagp = (__be16 *)blp - 1; + + /* Data object just before the first leaf entry. */ + enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); + + /* + * If it's not free then can't do this add without cleaning up: + * the space before the first leaf entry needs to be free so it + * can be expanded to hold the pointer to the new entry. + */ + if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + /* + * Check out the biggest freespace and see if it's the same one. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[0].offset)); + if (dup != enddup) { + /* + * Not the same free entry, just check its length. + */ + if (be16_to_cpu(dup->length) < len) + dup = NULL; + goto out; + } + + /* + * It is the biggest freespace, can it hold the leaf too? + */ + if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { + /* + * Yes, use the second-largest entry instead if it works. + */ + if (be16_to_cpu(bf[1].length) >= len) + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[1].offset)); + else + dup = NULL; + } + } +out: + *tagpp = tagp; + *dupp = dup; + *enddupp = enddup; +} + +/* + * compact the leaf entries. + * Leave the highest-numbered stale entry stale. + * XXX should be the one closest to mid but mid is not yet computed. + */ +static void +xfs_dir2_block_compact( + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_block_tail *btp, + struct xfs_dir2_leaf_entry *blp, + int *needlog, + int *lfloghigh, + int *lfloglow) +{ + int fromidx; /* source leaf index */ + int toidx; /* target leaf index */ + int needscan = 0; + int highstale; /* high stale index */ + + fromidx = toidx = be32_to_cpu(btp->count) - 1; + highstale = *lfloghigh = -1; + for (; fromidx >= 0; fromidx--) { + if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { + if (highstale == -1) + highstale = toidx; + else { + if (*lfloghigh == -1) + *lfloghigh = toidx; + continue; + } + } + if (fromidx < toidx) + blp[toidx] = blp[fromidx]; + toidx--; + } + *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); + *lfloghigh -= be32_to_cpu(btp->stale) - 1; + be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); + xfs_dir2_data_make_free(tp, bp, + (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), + (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), + needlog, &needscan); + blp += be32_to_cpu(btp->stale) - 1; + btp->stale = cpu_to_be32(1); + /* + * If we now need to rebuild the bestfree map, do so. + * This needs to happen before the next call to use_free. + */ + if (needscan) + xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog); +} + /* * Add an entry to a block directory. */ @@ -63,7 +271,6 @@ int /* error */ xfs_dir2_block_addname( xfs_da_args_t *args) /* directory op arguments */ { - xfs_dir2_data_free_t *bf; /* bestfree table in block */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ struct xfs_buf *bp; /* buffer for block */ @@ -94,134 +301,44 @@ xfs_dir2_block_addname( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the (one and only) directory block into dabuf bp. - */ - if ((error = - xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + + /* Read the (one and only) directory block into bp. */ + error = xfs_dir2_block_read(tp, dp, &bp); + if (error) return error; - } - ASSERT(bp != NULL); - hdr = bp->b_addr; - /* - * Check the magic number, corrupted if wrong. - */ - if (unlikely(hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))) { - XFS_CORRUPTION_ERROR("xfs_dir2_block_addname", - XFS_ERRLEVEL_LOW, mp, hdr); - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); - } + len = xfs_dir2_data_entsize(args->namelen); + /* * Set up pointers to parts of the block. */ - bf = hdr->bestfree; + hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); + /* - * No stale entries? Need space for entry and new leaf. - */ - if (!btp->stale) { - /* - * Tag just before the first leaf entry. - */ - tagp = (__be16 *)blp - 1; - /* - * Data object just before the first leaf entry. - */ - enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); - /* - * If it's not free then can't do this add without cleaning up: - * the space before the first leaf entry needs to be free so it - * can be expanded to hold the pointer to the new entry. - */ - if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG) - dup = enddup = NULL; - /* - * Check out the biggest freespace and see if it's the same one. - */ - else { - dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + be16_to_cpu(bf[0].offset)); - if (dup == enddup) { - /* - * It is the biggest freespace, is it too small - * to hold the new leaf too? - */ - if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { - /* - * Yes, we use the second-largest - * entry instead if it works. - */ - if (be16_to_cpu(bf[1].length) >= len) - dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + - be16_to_cpu(bf[1].offset)); - else - dup = NULL; - } - } else { - /* - * Not the same free entry, - * just check its length. - */ - if (be16_to_cpu(dup->length) < len) { - dup = NULL; - } - } - } - compact = 0; - } - /* - * If there are stale entries we'll use one for the leaf. - * Is the biggest entry enough to avoid compaction? - */ - else if (be16_to_cpu(bf[0].length) >= len) { - dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + be16_to_cpu(bf[0].offset)); - compact = 0; - } - /* - * Will need to compact to make this work. + * Find out if we can reuse stale entries or whether we need extra + * space for entry and new leaf. */ - else { - /* - * Tag just before the first leaf entry. - */ - tagp = (__be16 *)blp - 1; - /* - * Data object just before the first leaf entry. - */ - dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); - /* - * If it's not free then the data will go where the - * leaf data starts now, if it works at all. - */ - if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * - (uint)sizeof(*blp) < len) - dup = NULL; - } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) - dup = NULL; - else - dup = (xfs_dir2_data_unused_t *)blp; - compact = 1; - } + xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup, + &enddup, &compact, len); + /* - * If this isn't a real add, we're done with the buffer. + * Done everything we need for a space check now. */ - if (args->op_flags & XFS_DA_OP_JUSTCHECK) + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { xfs_trans_brelse(tp, bp); + if (!dup) + return XFS_ERROR(ENOSPC); + return 0; + } + /* * If we don't have space for the new entry & leaf ... */ if (!dup) { - /* - * Not trying to actually do anything, or don't have - * a space reservation: return no-space. - */ - if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) + /* Don't have a space reservation: return no-space. */ + if (args->total == 0) return XFS_ERROR(ENOSPC); /* * Convert to the next larger format. @@ -232,65 +349,24 @@ xfs_dir2_block_addname( return error; return xfs_dir2_leaf_addname(args); } - /* - * Just checking, and it would work, so say so. - */ - if (args->op_flags & XFS_DA_OP_JUSTCHECK) - return 0; + needlog = needscan = 0; + /* * If need to compact the leaf entries, do it now. - * Leave the highest-numbered stale entry stale. - * XXX should be the one closest to mid but mid is not yet computed. - */ - if (compact) { - int fromidx; /* source leaf index */ - int toidx; /* target leaf index */ - - for (fromidx = toidx = be32_to_cpu(btp->count) - 1, - highstale = lfloghigh = -1; - fromidx >= 0; - fromidx--) { - if (blp[fromidx].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { - if (highstale == -1) - highstale = toidx; - else { - if (lfloghigh == -1) - lfloghigh = toidx; - continue; - } - } - if (fromidx < toidx) - blp[toidx] = blp[fromidx]; - toidx--; - } - lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); - lfloghigh -= be32_to_cpu(btp->stale) - 1; - be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); - xfs_dir2_data_make_free(tp, bp, - (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), - (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), - &needlog, &needscan); - blp += be32_to_cpu(btp->stale) - 1; - btp->stale = cpu_to_be32(1); - /* - * If we now need to rebuild the bestfree map, do so. - * This needs to happen before the next call to use_free. - */ - if (needscan) { - xfs_dir2_data_freescan(mp, hdr, &needlog); - needscan = 0; - } - } - /* - * Set leaf logging boundaries to impossible state. - * For the no-stale case they're set explicitly. */ + if (compact) + xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog, + &lfloghigh, &lfloglow); else if (btp->stale) { + /* + * Set leaf logging boundaries to impossible state. + * For the no-stale case they're set explicitly. + */ lfloglow = be32_to_cpu(btp->count); lfloghigh = -1; } + /* * Find the slot that's first lower than our hash value, -1 if none. */ @@ -450,18 +526,13 @@ xfs_dir2_block_getdents( /* * If the block number in the offset is out of range, we're done. */ - if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) { + if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) return 0; - } - /* - * Can't read the block, give up, else get dabuf in bp. - */ - error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1, - &bp, XFS_DATA_FORK); + + error = xfs_dir2_block_read(NULL, dp, &bp); if (error) return error; - ASSERT(bp != NULL); /* * Extract the byte offset we start at from the seek pointer. * We'll skip entries before this. @@ -637,14 +708,11 @@ xfs_dir2_block_lookup_int( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the buffer, return error if we can't get it. - */ - if ((error = - xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + + error = xfs_dir2_block_read(tp, dp, &bp); + if (error) return error; - } - ASSERT(bp != NULL); + hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); btp = xfs_dir2_block_tail_p(mp, hdr); @@ -917,10 +985,10 @@ xfs_dir2_leaf_to_block( /* * Read the data block if we don't already have it, give up if it fails. */ - if (dbp == NULL && - (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, - XFS_DATA_FORK))) { - return error; + if (!dbp) { + error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp); + if (error) + return error; } hdr = dbp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); @@ -944,6 +1012,7 @@ xfs_dir2_leaf_to_block( /* * Start converting it to block form. */ + dbp->b_ops = &xfs_dir2_block_buf_ops; hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); needlog = 1; needscan = 0; @@ -1073,6 +1142,7 @@ xfs_dir2_sf_to_block( kmem_free(sfp); return error; } + bp->b_ops = &xfs_dir2_block_buf_ops; hdr = bp->b_addr; hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); /* diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 44ffd4d6bc91..ffcf1774152e 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -34,14 +34,13 @@ STATIC xfs_dir2_data_free_t * xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); -#ifdef DEBUG /* * Check the consistency of the data block. * The input can also be a block-format directory. - * Pop an assert if we find anything bad. + * Return 0 is the buffer is good, otherwise an error. */ -void -xfs_dir2_data_check( +int +__xfs_dir2_data_check( struct xfs_inode *dp, /* incore inode pointer */ struct xfs_buf *bp) /* data block's buffer */ { @@ -64,18 +63,23 @@ xfs_dir2_data_check( int stale; /* count of stale leaves */ struct xfs_name name; - mp = dp->i_mount; + mp = bp->b_target->bt_mount; hdr = bp->b_addr; bf = hdr->bestfree; p = (char *)(hdr + 1); - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): btp = xfs_dir2_block_tail_p(mp, hdr); lep = xfs_dir2_block_leaf_p(btp); endp = (char *)lep; - } else { - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); + break; + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): endp = (char *)hdr + mp->m_dirblksize; + break; + default: + XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp); + return EFSCORRUPTED; } count = lastfree = freeseen = 0; @@ -83,19 +87,22 @@ xfs_dir2_data_check( * Account for zero bestfree entries. */ if (!bf[0].length) { - ASSERT(!bf[0].offset); + XFS_WANT_CORRUPTED_RETURN(!bf[0].offset); freeseen |= 1 << 0; } if (!bf[1].length) { - ASSERT(!bf[1].offset); + XFS_WANT_CORRUPTED_RETURN(!bf[1].offset); freeseen |= 1 << 1; } if (!bf[2].length) { - ASSERT(!bf[2].offset); + XFS_WANT_CORRUPTED_RETURN(!bf[2].offset); freeseen |= 1 << 2; } - ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length)); - ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length)); + + XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >= + be16_to_cpu(bf[1].length)); + XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >= + be16_to_cpu(bf[2].length)); /* * Loop over the data/unused entries. */ @@ -107,17 +114,20 @@ xfs_dir2_data_check( * doesn't need to be there. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - ASSERT(lastfree == 0); - ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == - (char *)dup - (char *)hdr); + XFS_WANT_CORRUPTED_RETURN(lastfree == 0); + XFS_WANT_CORRUPTED_RETURN( + be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == + (char *)dup - (char *)hdr); dfp = xfs_dir2_data_freefind(hdr, dup); if (dfp) { i = (int)(dfp - bf); - ASSERT((freeseen & (1 << i)) == 0); + XFS_WANT_CORRUPTED_RETURN( + (freeseen & (1 << i)) == 0); freeseen |= 1 << i; } else { - ASSERT(be16_to_cpu(dup->length) <= - be16_to_cpu(bf[2].length)); + XFS_WANT_CORRUPTED_RETURN( + be16_to_cpu(dup->length) <= + be16_to_cpu(bf[2].length)); } p += be16_to_cpu(dup->length); lastfree = 1; @@ -130,10 +140,12 @@ xfs_dir2_data_check( * The linear search is crude but this is DEBUG code. */ dep = (xfs_dir2_data_entry_t *)p; - ASSERT(dep->namelen != 0); - ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0); - ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == - (char *)dep - (char *)hdr); + XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0); + XFS_WANT_CORRUPTED_RETURN( + !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); + XFS_WANT_CORRUPTED_RETURN( + be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == + (char *)dep - (char *)hdr); count++; lastfree = 0; if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { @@ -148,27 +160,122 @@ xfs_dir2_data_check( be32_to_cpu(lep[i].hashval) == hash) break; } - ASSERT(i < be32_to_cpu(btp->count)); + XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); } p += xfs_dir2_data_entsize(dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. */ - ASSERT(freeseen == 7); + XFS_WANT_CORRUPTED_RETURN(freeseen == 7); if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { if (lep[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; if (i > 0) - ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); + XFS_WANT_CORRUPTED_RETURN( + be32_to_cpu(lep[i].hashval) >= + be32_to_cpu(lep[i - 1].hashval)); } - ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); - ASSERT(stale == be32_to_cpu(btp->stale)); + XFS_WANT_CORRUPTED_RETURN(count == + be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); + XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale)); } + return 0; +} + +static void +xfs_dir2_data_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC); + block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; + + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +/* + * Readahead of the first block of the directory when it is opened is completely + * oblivious to the format of the directory. Hence we can either get a block + * format buffer or a data format buffer on readahead. + */ +static void +xfs_dir2_data_reada_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): + bp->b_ops = &xfs_dir2_block_buf_ops; + bp->b_ops->verify_read(bp); + return; + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): + xfs_dir2_data_verify(bp); + return; + default: + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } +} + +static void +xfs_dir2_data_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_data_verify(bp); +} + +static void +xfs_dir2_data_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_data_verify(bp); +} + +const struct xfs_buf_ops xfs_dir2_data_buf_ops = { + .verify_read = xfs_dir2_data_read_verify, + .verify_write = xfs_dir2_data_write_verify, +}; + +static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = { + .verify_read = xfs_dir2_data_reada_verify, + .verify_write = xfs_dir2_data_write_verify, +}; + + +int +xfs_dir2_data_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mapped_bno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, + XFS_DATA_FORK, &xfs_dir2_data_buf_ops); +} + +int +xfs_dir2_data_readahead( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mapped_bno) +{ + return xfs_da_reada_buf(tp, dp, bno, mapped_bno, + XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops); } -#endif /* * Given a data block and an unused entry from that block, @@ -409,10 +516,9 @@ xfs_dir2_data_init( */ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, XFS_DATA_FORK); - if (error) { + if (error) return error; - } - ASSERT(bp != NULL); + bp->b_ops = &xfs_dir2_data_buf_ops; /* * Initialize the header. diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 0b296253bd01..60cd2fa4e047 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -48,6 +48,83 @@ static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp, int first, int last); static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); +static void +xfs_dir2_leaf_verify( + struct xfs_buf *bp, + __be16 magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_leaf_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->info.magic == magic; + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_dir2_leaf1_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); +} + +static void +xfs_dir2_leaf1_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); +} + +void +xfs_dir2_leafn_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); +} + +void +xfs_dir2_leafn_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); +} + +static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = { + .verify_read = xfs_dir2_leaf1_read_verify, + .verify_write = xfs_dir2_leaf1_write_verify, +}; + +const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = { + .verify_read = xfs_dir2_leafn_read_verify, + .verify_write = xfs_dir2_leafn_write_verify, +}; + +static int +xfs_dir2_leaf_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops); +} + +int +xfs_dir2_leafn_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops); +} /* * Convert a block form directory to a leaf form directory. @@ -125,6 +202,7 @@ xfs_dir2_block_to_leaf( /* * Fix up the block header, make it a data block. */ + dbp->b_ops = &xfs_dir2_data_buf_ops; hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); if (needscan) xfs_dir2_data_freescan(mp, hdr, &needlog); @@ -311,15 +389,11 @@ xfs_dir2_leaf_addname( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the leaf block. - */ - error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK); - if (error) { + + error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); + if (error) return error; - } - ASSERT(lbp != NULL); + /* * Look up the entry by hash value and name. * We know it's not there, our caller has already done a lookup. @@ -494,22 +568,21 @@ xfs_dir2_leaf_addname( hdr = dbp->b_addr; bestsp[use_block] = hdr->bestfree[0].length; grown = 1; - } - /* - * Already had space in some data block. - * Just read that one in. - */ - else { - if ((error = - xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), - -1, &dbp, XFS_DATA_FORK))) { + } else { + /* + * Already had space in some data block. + * Just read that one in. + */ + error = xfs_dir2_data_read(tp, dp, + xfs_dir2_db_to_da(mp, use_block), + -1, &dbp); + if (error) { xfs_trans_brelse(tp, lbp); return error; } hdr = dbp->b_addr; grown = 0; } - xfs_dir2_data_check(dp, dbp); /* * Point to the biggest freespace in our data block. */ @@ -892,10 +965,9 @@ xfs_dir2_leaf_readbuf( * Read the directory block starting at the first mapping. */ mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); - error = xfs_da_read_buf(NULL, dp, map->br_startoff, + error = xfs_dir2_data_read(NULL, dp, map->br_startoff, map->br_blockcount >= mp->m_dirblkfsbs ? - XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, - &bp, XFS_DATA_FORK); + XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp); /* * Should just skip over the data block instead of giving up. @@ -922,11 +994,11 @@ xfs_dir2_leaf_readbuf( */ if (i > mip->ra_current && map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { - xfs_buf_readahead(mp->m_ddev_targp, + xfs_dir2_data_readahead(NULL, dp, + map[mip->ra_index].br_startoff + mip->ra_offset, XFS_FSB_TO_DADDR(mp, map[mip->ra_index].br_startblock + - mip->ra_offset), - (int)BTOBB(mp->m_dirblksize)); + mip->ra_offset)); mip->ra_current = i; } @@ -935,10 +1007,9 @@ xfs_dir2_leaf_readbuf( * use our mapping, but this is a very rare case. */ else if (i > mip->ra_current) { - xfs_da_reada_buf(NULL, dp, + xfs_dir2_data_readahead(NULL, dp, map[mip->ra_index].br_startoff + - mip->ra_offset, - XFS_DATA_FORK); + mip->ra_offset, -1); mip->ra_current = i; } @@ -1177,15 +1248,14 @@ xfs_dir2_leaf_init( * Get the buffer for the block. */ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, - XFS_DATA_FORK); - if (error) { + XFS_DATA_FORK); + if (error) return error; - } - ASSERT(bp != NULL); - leaf = bp->b_addr; + /* * Initialize the header. */ + leaf = bp->b_addr; leaf->hdr.info.magic = cpu_to_be16(magic); leaf->hdr.info.forw = 0; leaf->hdr.info.back = 0; @@ -1198,10 +1268,12 @@ xfs_dir2_leaf_init( * the block. */ if (magic == XFS_DIR2_LEAF1_MAGIC) { + bp->b_ops = &xfs_dir2_leaf1_buf_ops; ltp = xfs_dir2_leaf_tail_p(mp, leaf); ltp->bestcount = 0; xfs_dir2_leaf_log_tail(tp, bp); - } + } else + bp->b_ops = &xfs_dir2_leafn_buf_ops; *bpp = bp; return 0; } @@ -1372,13 +1444,11 @@ xfs_dir2_leaf_lookup_int( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the leaf block into the buffer. - */ - error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK); + + error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); if (error) return error; + *lbpp = lbp; leaf = lbp->b_addr; xfs_dir2_leaf_check(dp, lbp); @@ -1409,14 +1479,13 @@ xfs_dir2_leaf_lookup_int( if (newdb != curdb) { if (dbp) xfs_trans_brelse(tp, dbp); - error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, newdb), - -1, &dbp, XFS_DATA_FORK); + error = xfs_dir2_data_read(tp, dp, + xfs_dir2_db_to_da(mp, newdb), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; } - xfs_dir2_data_check(dp, dbp); curdb = newdb; } /* @@ -1451,9 +1520,9 @@ xfs_dir2_leaf_lookup_int( ASSERT(cidb != -1); if (cidb != curdb) { xfs_trans_brelse(tp, dbp); - error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, cidb), - -1, &dbp, XFS_DATA_FORK); + error = xfs_dir2_data_read(tp, dp, + xfs_dir2_db_to_da(mp, cidb), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1738,10 +1807,9 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. */ - if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, - XFS_DATA_FORK))) { + error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp); + if (error) return error; - } leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); @@ -1864,10 +1932,9 @@ xfs_dir2_node_to_leaf( /* * Read the freespace block. */ - if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, - XFS_DATA_FORK))) { + error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp); + if (error) return error; - } free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); ASSERT(!free->hdr.firstdb); @@ -1890,7 +1957,10 @@ xfs_dir2_node_to_leaf( xfs_dir2_leaf_compact(args, lbp); else xfs_dir2_leaf_log_header(tp, lbp); + + lbp->b_ops = &xfs_dir2_leaf1_buf_ops; leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); + /* * Set up the leaf tail from the freespace block. */ diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 6c7052406605..5980f9b7fa9b 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -55,6 +55,74 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, static int xfs_dir2_node_addname_int(xfs_da_args_t *args, xfs_da_state_blk_t *fblk); +static void +xfs_dir2_free_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_free_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC); + if (!block_ok) { + XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic", + XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_dir2_free_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_free_verify(bp); +} + +static void +xfs_dir2_free_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_free_verify(bp); +} + +static const struct xfs_buf_ops xfs_dir2_free_buf_ops = { + .verify_read = xfs_dir2_free_read_verify, + .verify_write = xfs_dir2_free_write_verify, +}; + + +static int +__xfs_dir2_free_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir2_free_buf_ops); +} + +int +xfs_dir2_free_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf **bpp) +{ + return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp); +} + +static int +xfs_dir2_free_try_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf **bpp) +{ + return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp); +} + /* * Log entries from a freespace block. */ @@ -131,11 +199,12 @@ xfs_dir2_leaf_to_node( /* * Get the buffer for the new freespace block. */ - if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, - XFS_DATA_FORK))) { + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, + XFS_DATA_FORK); + if (error) return error; - } - ASSERT(fbp != NULL); + fbp->b_ops = &xfs_dir2_free_buf_ops; + free = fbp->b_addr; leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); @@ -157,7 +226,10 @@ xfs_dir2_leaf_to_node( *to = cpu_to_be16(off); } free->hdr.nused = cpu_to_be32(n); + + lbp->b_ops = &xfs_dir2_leafn_buf_ops; leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); + /* * Log everything. */ @@ -394,12 +466,10 @@ xfs_dir2_leafn_lookup_for_addname( */ if (curbp) xfs_trans_brelse(tp, curbp); - /* - * Read the free block. - */ - error = xfs_da_read_buf(tp, dp, + + error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, newfdb), - -1, &curbp, XFS_DATA_FORK); + &curbp); if (error) return error; free = curbp->b_addr; @@ -534,9 +604,9 @@ xfs_dir2_leafn_lookup_for_entry( ASSERT(state->extravalid); curbp = state->extrablk.bp; } else { - error = xfs_da_read_buf(tp, dp, + error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, newdb), - -1, &curbp, XFS_DATA_FORK); + -1, &curbp); if (error) return error; } @@ -568,6 +638,7 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = (int)((char *)dep - (char *)curbp->b_addr); state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + curbp->b_ops = &xfs_dir2_data_buf_ops; if (cmp == XFS_CMP_EXACT) return XFS_ERROR(EEXIST); } @@ -582,6 +653,7 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = -1; state->extrablk.blkno = curdb; state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + curbp->b_ops = &xfs_dir2_data_buf_ops; } else { /* If the curbp is not the CI match block, drop it */ if (state->extrablk.bp != curbp) @@ -825,6 +897,77 @@ xfs_dir2_leafn_rebalance( } } +static int +xfs_dir2_data_block_free( + xfs_da_args_t *args, + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_free *free, + xfs_dir2_db_t fdb, + int findex, + struct xfs_buf *fbp, + int longest) +{ + struct xfs_trans *tp = args->trans; + int logfree = 0; + + if (!hdr) { + /* One less used entry in the free table. */ + be32_add_cpu(&free->hdr.nused, -1); + xfs_dir2_free_log_header(tp, fbp); + + /* + * If this was the last entry in the table, we can trim the + * table size back. There might be other entries at the end + * referring to non-existent data blocks, get those too. + */ + if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { + int i; /* free entry index */ + + for (i = findex - 1; i >= 0; i--) { + if (free->bests[i] != cpu_to_be16(NULLDATAOFF)) + break; + } + free->hdr.nvalid = cpu_to_be32(i + 1); + logfree = 0; + } else { + /* Not the last entry, just punch it out. */ + free->bests[findex] = cpu_to_be16(NULLDATAOFF); + logfree = 1; + } + /* + * If there are no useful entries left in the block, + * get rid of the block if we can. + */ + if (!free->hdr.nused) { + int error; + + error = xfs_dir2_shrink_inode(args, fdb, fbp); + if (error == 0) { + fbp = NULL; + logfree = 0; + } else if (error != ENOSPC || args->total != 0) + return error; + /* + * It's possible to get ENOSPC if there is no + * space reservation. In this case some one + * else will eventually get rid of this block. + */ + } + } else { + /* + * Data block is not empty, just set the free entry to the new + * value. + */ + free->bests[findex] = cpu_to_be16(longest); + logfree = 1; + } + + /* Log the free entry that changed, unless we got rid of it. */ + if (logfree) + xfs_dir2_free_log_bests(tp, fbp, findex, findex); + return 0; +} + /* * Remove an entry from a node directory. * This removes the leaf entry and the data entry, @@ -908,17 +1051,16 @@ xfs_dir2_leafn_remove( xfs_dir2_db_t fdb; /* freeblock block number */ int findex; /* index in freeblock entries */ xfs_dir2_free_t *free; /* freeblock structure */ - int logfree; /* need to log free entry */ /* * Convert the data block number to a free block, * read in the free block. */ fdb = xfs_dir2_db_to_fdb(mp, db); - if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), - -1, &fbp, XFS_DATA_FORK))) { + error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb), + &fbp); + if (error) return error; - } free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); ASSERT(be32_to_cpu(free->hdr.firstdb) == @@ -954,68 +1096,12 @@ xfs_dir2_leafn_remove( * If we got rid of the data block, we can eliminate that entry * in the free block. */ - if (hdr == NULL) { - /* - * One less used entry in the free table. - */ - be32_add_cpu(&free->hdr.nused, -1); - xfs_dir2_free_log_header(tp, fbp); - /* - * If this was the last entry in the table, we can - * trim the table size back. There might be other - * entries at the end referring to non-existent - * data blocks, get those too. - */ - if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { - int i; /* free entry index */ - - for (i = findex - 1; - i >= 0 && - free->bests[i] == cpu_to_be16(NULLDATAOFF); - i--) - continue; - free->hdr.nvalid = cpu_to_be32(i + 1); - logfree = 0; - } - /* - * Not the last entry, just punch it out. - */ - else { - free->bests[findex] = cpu_to_be16(NULLDATAOFF); - logfree = 1; - } - /* - * If there are no useful entries left in the block, - * get rid of the block if we can. - */ - if (!free->hdr.nused) { - error = xfs_dir2_shrink_inode(args, fdb, fbp); - if (error == 0) { - fbp = NULL; - logfree = 0; - } else if (error != ENOSPC || args->total != 0) - return error; - /* - * It's possible to get ENOSPC if there is no - * space reservation. In this case some one - * else will eventually get rid of this block. - */ - } - } - /* - * Data block is not empty, just set the free entry to - * the new value. - */ - else { - free->bests[findex] = cpu_to_be16(longest); - logfree = 1; - } - /* - * Log the free entry that changed, unless we got rid of it. - */ - if (logfree) - xfs_dir2_free_log_bests(tp, fbp, findex, findex); + error = xfs_dir2_data_block_free(args, hdr, free, + fdb, findex, fbp, longest); + if (error) + return error; } + xfs_dir2_leafn_check(dp, bp); /* * Return indication of whether this leaf block is empty enough @@ -1169,12 +1255,11 @@ xfs_dir2_leafn_toosmall( /* * Read the sibling leaf block. */ - if ((error = - xfs_da_read_buf(state->args->trans, state->args->dp, blkno, - -1, &bp, XFS_DATA_FORK))) { + error = xfs_dir2_leafn_read(state->args->trans, state->args->dp, + blkno, -1, &bp); + if (error) return error; - } - ASSERT(bp != NULL); + /* * Count bytes in the two blocks combined. */ @@ -1454,14 +1539,13 @@ xfs_dir2_node_addname_int( * This should be really rare, so there's no reason * to avoid it. */ - if ((error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), -2, &fbp, - XFS_DATA_FORK))) { + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(mp, fbno), + &fbp); + if (error) return error; - } - if (unlikely(fbp == NULL)) { + if (!fbp) continue; - } free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = 0; @@ -1520,9 +1604,10 @@ xfs_dir2_node_addname_int( * that was just allocated. */ fbno = xfs_dir2_db_to_fdb(mp, dbno); - if (unlikely(error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), -2, &fbp, - XFS_DATA_FORK))) + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(mp, fbno), + &fbp); + if (error) return error; /* @@ -1561,12 +1646,12 @@ xfs_dir2_node_addname_int( /* * Get a buffer for the new block. */ - if ((error = xfs_da_get_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), - -1, &fbp, XFS_DATA_FORK))) { + error = xfs_da_get_buf(tp, dp, + xfs_dir2_db_to_da(mp, fbno), + -1, &fbp, XFS_DATA_FORK); + if (error) return error; - } - ASSERT(fbp != NULL); + fbp->b_ops = &xfs_dir2_free_buf_ops; /* * Initialize the new block to be empty, and remember @@ -1630,8 +1715,8 @@ xfs_dir2_node_addname_int( /* * Read the data block in. */ - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), - -1, &dbp, XFS_DATA_FORK); + error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno), + -1, &dbp); if (error) return error; hdr = dbp->b_addr; @@ -1917,18 +2002,15 @@ xfs_dir2_node_trim_free( /* * Read the freespace block. */ - if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, - XFS_DATA_FORK))) { + error = xfs_dir2_free_try_read(tp, dp, fo, &bp); + if (error) return error; - } - /* * There can be holes in freespace. If fo is a hole, there's * nothing to do. */ - if (bp == NULL) { + if (!bp) return 0; - } free = bp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); /* diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 3523d3e15aa8..7da79f6515fd 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -30,6 +30,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const unsigned char *name, int len); /* xfs_dir2_block.c */ +extern const struct xfs_buf_ops xfs_dir2_block_buf_ops; + extern int xfs_dir2_block_addname(struct xfs_da_args *args); extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, xfs_off_t *offset, filldir_t filldir); @@ -41,10 +43,19 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, /* xfs_dir2_data.c */ #ifdef DEBUG -extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); +#define xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp); #else #define xfs_dir2_data_check(dp,bp) #endif + +extern const struct xfs_buf_ops xfs_dir2_data_buf_ops; + +extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); +extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); +extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mapped_bno); + extern struct xfs_dir2_data_free * xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_unused *dup, int *loghead); @@ -66,6 +77,10 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ +extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops; + +extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_buf *dbp); extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); @@ -115,6 +130,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args); extern int xfs_dir2_node_replace(struct xfs_da_args *args); extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, int *rvalp); +extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, struct xfs_buf **bpp); /* xfs_dir2_sf.c */ extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index bf27fcca4843..9e1bf5294c91 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -248,7 +248,59 @@ xfs_qm_init_dquot_blk( xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } +static void +xfs_dquot_buf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + struct xfs_disk_dquot *ddq; + xfs_dqid_t id = 0; + int i; + + /* + * On the first read of the buffer, verify that each dquot is valid. + * We don't know what the id of the dquot is supposed to be, just that + * they should be increasing monotonically within the buffer. If the + * first id is corrupt, then it will fail on the second dquot in the + * buffer so corruptions could point to the wrong dquot in this case. + */ + for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { + int error; + + ddq = &d[i].dd_diskdq; + + if (i == 0) + id = be32_to_cpu(ddq->d_id); + + error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, + "xfs_dquot_read_verify"); + if (error) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d); + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } + } +} + +static void +xfs_dquot_buf_read_verify( + struct xfs_buf *bp) +{ + xfs_dquot_buf_verify(bp); +} + +void +xfs_dquot_buf_write_verify( + struct xfs_buf *bp) +{ + xfs_dquot_buf_verify(bp); +} +const struct xfs_buf_ops xfs_dquot_buf_ops = { + .verify_read = xfs_dquot_buf_read_verify, + .verify_write = xfs_dquot_buf_write_verify, +}; /* * Allocate a block and fill it with dquots. @@ -315,6 +367,7 @@ xfs_qm_dqalloc( error = xfs_buf_geterror(bp); if (error) goto error1; + bp->b_ops = &xfs_dquot_buf_ops; /* * Make a chunk of dquots out of this buffer and log @@ -359,6 +412,51 @@ xfs_qm_dqalloc( return (error); } +STATIC int +xfs_qm_dqrepair( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_dquot *dqp, + xfs_dqid_t firstid, + struct xfs_buf **bpp) +{ + int error; + struct xfs_disk_dquot *ddq; + struct xfs_dqblk *d; + int i; + + /* + * Read the buffer without verification so we get the corrupted + * buffer returned to us. make sure we verify it on write, though. + */ + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0, bpp, NULL); + + if (error) { + ASSERT(*bpp == NULL); + return XFS_ERROR(error); + } + (*bpp)->b_ops = &xfs_dquot_buf_ops; + + ASSERT(xfs_buf_islocked(*bpp)); + d = (struct xfs_dqblk *)(*bpp)->b_addr; + + /* Do the actual repair of dquots in this buffer */ + for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { + ddq = &d[i].dd_diskdq; + error = xfs_qm_dqcheck(mp, ddq, firstid + i, + dqp->dq_flags & XFS_DQ_ALLTYPES, + XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair"); + if (error) { + /* repair failed, we're screwed */ + xfs_trans_brelse(tp, *bpp); + return XFS_ERROR(EIO); + } + } + + return 0; +} /* * Maps a dquot to the buffer containing its on-disk version. @@ -378,7 +476,6 @@ xfs_qm_dqtobp( xfs_buf_t *bp; xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); xfs_mount_t *mp = dqp->q_mount; - xfs_disk_dquot_t *ddq; xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); xfs_trans_t *tp = (tpp ? *tpp : NULL); @@ -439,33 +536,24 @@ xfs_qm_dqtobp( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, - 0, &bp); - if (error || !bp) - return XFS_ERROR(error); - } - - ASSERT(xfs_buf_islocked(bp)); + 0, &bp, &xfs_dquot_buf_ops); - /* - * calculate the location of the dquot inside the buffer. - */ - ddq = bp->b_addr + dqp->q_bufoffset; + if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { + xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * + mp->m_quotainfo->qi_dqperchunk; + ASSERT(bp == NULL); + error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp); + } - /* - * A simple sanity check in case we got a corrupted dquot... - */ - error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, - flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), - "dqtobp"); - if (error) { - if (!(flags & XFS_QMOPT_DQREPAIR)) { - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EIO); + if (error) { + ASSERT(bp == NULL); + return XFS_ERROR(error); } } + ASSERT(xfs_buf_islocked(bp)); *O_bpp = bp; - *O_ddpp = ddq; + *O_ddpp = bp->b_addr + dqp->q_bufoffset; return (0); } @@ -920,7 +1008,7 @@ xfs_qm_dqflush( * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0, &bp); + mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 7d20af27346d..c694a8469c4a 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -161,4 +161,6 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) return dqp; } +extern const struct xfs_buf_ops xfs_dquot_buf_ops; + #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 8c6d1d70278c..a83611849cee 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -29,6 +29,7 @@ #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_trace.h" +#include "xfs_icache.h" /* * Note that we only accept fileids which are long enough rather than allow diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index aa473fa640a2..67284edb84d7 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -31,6 +31,8 @@ #include "xfs_error.h" #include "xfs_vnodeops.h" #include "xfs_da_btree.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2_priv.h" #include "xfs_ioctl.h" #include "xfs_trace.h" @@ -84,7 +86,7 @@ xfs_rw_ilock_demote( * valid before the operation, it will be read from disk before * being partially zeroed. */ -STATIC int +int xfs_iozero( struct xfs_inode *ip, /* inode */ loff_t pos, /* offset in file */ @@ -255,15 +257,14 @@ xfs_file_aio_read( xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - if ((iocb->ki_pos & target->bt_smask) || - (size & target->bt_smask)) { - if (iocb->ki_pos == i_size_read(inode)) + if ((pos & target->bt_smask) || (size & target->bt_smask)) { + if (pos == i_size_read(inode)) return 0; return -XFS_ERROR(EINVAL); } } - n = mp->m_super->s_maxbytes - iocb->ki_pos; + n = mp->m_super->s_maxbytes - pos; if (n <= 0 || size == 0) return 0; @@ -289,20 +290,21 @@ xfs_file_aio_read( xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { - ret = -xfs_flushinval_pages(ip, - (iocb->ki_pos & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); + ret = -filemap_write_and_wait_range( + VFS_I(ip)->i_mapping, + pos, -1); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } + truncate_pagecache_range(VFS_I(ip), pos, -1); } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } - trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); + trace_xfs_file_read(ip, size, pos, ioflags); - ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos); + ret = generic_file_aio_read(iocb, iovp, nr_segs, pos); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -670,10 +672,11 @@ xfs_file_dio_aio_write( goto out; if (mapping->nrpages) { - ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, - FI_REMAPF_LOCKED); + ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + pos, -1); if (ret) goto out; + truncate_pagecache_range(VFS_I(ip), pos, -1); } /* @@ -728,16 +731,17 @@ xfs_file_buffered_aio_write( write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); ret = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, &iocb->ki_pos, count, ret); + pos, &iocb->ki_pos, count, 0); + /* - * if we just got an ENOSPC, flush the inode now we aren't holding any - * page locks and retry *once* + * If we just got an ENOSPC, try to write back all dirty inodes to + * convert delalloc space to free up some of the excess reserved + * metadata space. */ if (ret == -ENOSPC && !enospc) { enospc = 1; - ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); - if (!ret) - goto write_retry; + xfs_flush_inodes(ip->i_mount); + goto write_retry; } current->backing_dev_info = NULL; @@ -889,7 +893,7 @@ xfs_dir_open( */ mode = xfs_ilock_map_shared(ip); if (ip->i_d.di_nextents > 0) - xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK); + xfs_dir2_data_readahead(NULL, ip, 0, -1); xfs_iunlock(ip, mode); return 0; } diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index c13fed8c394a..6dda3f949b04 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -233,7 +233,8 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ #define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ #define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ -#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ +#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ @@ -339,6 +340,35 @@ typedef struct xfs_error_injection { /* + * Speculative preallocation trimming. + */ +#define XFS_EOFBLOCKS_VERSION 1 +struct xfs_eofblocks { + __u32 eof_version; + __u32 eof_flags; + uid_t eof_uid; + gid_t eof_gid; + prid_t eof_prid; + __u32 pad32; + __u64 eof_min_file_size; + __u64 pad64[12]; +}; + +/* eof_flags values */ +#define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */ +#define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */ +#define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */ +#define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */ +#define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */ +#define XFS_EOF_FLAGS_VALID \ + (XFS_EOF_FLAGS_SYNC | \ + XFS_EOF_FLAGS_UID | \ + XFS_EOF_FLAGS_GID | \ + XFS_EOF_FLAGS_PRID | \ + XFS_EOF_FLAGS_MINFILESIZE) + + +/* * The user-level Handle Request interface structure. */ typedef struct xfs_fsop_handlereq { @@ -456,6 +486,7 @@ typedef struct xfs_handle { /* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ #define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) #define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) +#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_eofblocks) /* * ioctl commands that replace IRIX syssgi()'s diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c deleted file mode 100644 index 652b875a9d4c..000000000000 --- a/fs/xfs/xfs_fs_subr.c +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_vnodeops.h" -#include "xfs_bmap_btree.h" -#include "xfs_inode.h" -#include "xfs_trace.h" - -/* - * note: all filemap functions return negative error codes. These - * need to be inverted before returning to the xfs core functions. - */ -void -xfs_tosspages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last, - int fiopt) -{ - /* can't toss partial tail pages, so mask them out */ - last &= ~(PAGE_SIZE - 1); - truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1); -} - -int -xfs_flushinval_pages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last, - int fiopt) -{ - struct address_space *mapping = VFS_I(ip)->i_mapping; - int ret = 0; - - trace_xfs_pagecache_inval(ip, first, last); - - xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = filemap_write_and_wait_range(mapping, first, - last == -1 ? LLONG_MAX : last); - if (!ret) - truncate_inode_pages_range(mapping, first, last); - return -ret; -} - -int -xfs_flush_pages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last, - uint64_t flags, - int fiopt) -{ - struct address_space *mapping = VFS_I(ip)->i_mapping; - int ret = 0; - int ret2; - - xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = -filemap_fdatawrite_range(mapping, first, - last == -1 ? LLONG_MAX : last); - if (flags & XBF_ASYNC) - return ret; - ret2 = xfs_wait_on_pages(ip, first, last); - if (!ret) - ret = ret2; - return ret; -} - -int -xfs_wait_on_pages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last) -{ - struct address_space *mapping = VFS_I(ip)->i_mapping; - - if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { - return -filemap_fdatawait_range(mapping, first, - last == -1 ? XFS_ISIZE(ip) - 1 : last); - } - return 0; -} diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index c25b094efbf7..94eaeedc5498 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -97,7 +97,9 @@ xfs_fs_geometry( (xfs_sb_version_haslazysbcount(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | (xfs_sb_version_hasattr2(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_ATTR2 : 0); + XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) | + (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_PROJID32 : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; @@ -112,18 +114,40 @@ xfs_fs_geometry( return 0; } +static struct xfs_buf * +xfs_growfs_get_hdr_buf( + struct xfs_mount *mp, + xfs_daddr_t blkno, + size_t numblks, + int flags, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; + + bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags); + if (!bp) + return NULL; + + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + bp->b_bn = blkno; + bp->b_maps[0].bm_bn = blkno; + bp->b_ops = ops; + + return bp; +} + static int xfs_growfs_data_private( xfs_mount_t *mp, /* mount point for filesystem */ xfs_growfs_data_t *in) /* growfs data input struct */ { xfs_agf_t *agf; + struct xfs_agfl *agfl; xfs_agi_t *agi; xfs_agnumber_t agno; xfs_extlen_t agsize; xfs_extlen_t tmpsize; xfs_alloc_rec_t *arec; - struct xfs_btree_block *block; xfs_buf_t *bp; int bucket; int dpct; @@ -146,9 +170,14 @@ xfs_growfs_data_private( dpct = pct - mp->m_sb.sb_imax_pct; bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, NULL); if (!bp) return EIO; + if (bp->b_error) { + int error = bp->b_error; + xfs_buf_relse(bp); + return error; + } xfs_buf_relse(bp); new = nb; /* use new as a temporary here */ @@ -186,17 +215,18 @@ xfs_growfs_data_private( nfree = 0; for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { /* - * AG freelist header block + * AG freespace header block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agf_buf_ops); if (!bp) { error = ENOMEM; goto error0; } + agf = XFS_BUF_TO_AGF(bp); - memset(agf, 0, mp->m_sb.sb_sectsize); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); agf->agf_seqno = cpu_to_be32(agno); @@ -223,17 +253,39 @@ xfs_growfs_data_private( goto error0; /* + * AG freelist header block + */ + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agfl_buf_ops); + if (!bp) { + error = ENOMEM; + goto error0; + } + + agfl = XFS_BUF_TO_AGFL(bp); + for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) + agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + + /* * AG inode header block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agi_buf_ops); if (!bp) { error = ENOMEM; goto error0; } + agi = XFS_BUF_TO_AGI(bp); - memset(agi, 0, mp->m_sb.sb_sectsize); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); agi->agi_seqno = cpu_to_be32(agno); @@ -254,24 +306,22 @@ xfs_growfs_data_private( /* * BNO btree root block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_allocbt_buf_ops); + if (!bp) { error = ENOMEM; goto error0; } - block = XFS_BUF_TO_BLOCK(bp); - memset(block, 0, mp->m_sb.sb_blocksize); - block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); - block->bb_level = 0; - block->bb_numrecs = cpu_to_be16(1); - block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - arec = XFS_ALLOC_REC_ADDR(mp, block, 1); + + xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -280,25 +330,22 @@ xfs_growfs_data_private( /* * CNT btree root block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_allocbt_buf_ops); if (!bp) { error = ENOMEM; goto error0; } - block = XFS_BUF_TO_BLOCK(bp); - memset(block, 0, mp->m_sb.sb_blocksize); - block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); - block->bb_level = 0; - block->bb_numrecs = cpu_to_be16(1); - block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - arec = XFS_ALLOC_REC_ADDR(mp, block, 1); + + xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); nfree += be32_to_cpu(arec->ar_blockcount); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -307,20 +354,17 @@ xfs_growfs_data_private( /* * INO btree root block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_inobt_buf_ops); if (!bp) { error = ENOMEM; goto error0; } - block = XFS_BUF_TO_BLOCK(bp); - memset(block, 0, mp->m_sb.sb_blocksize); - block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); - block->bb_level = 0; - block->bb_numrecs = 0; - block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + + xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -399,9 +443,28 @@ xfs_growfs_data_private( /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + error = 0; + /* + * new secondary superblocks need to be zeroed, not read from + * disk as the contents of the new area we are growing into is + * completely unknown. + */ + if (agno < oagcount) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &bp, + &xfs_sb_buf_ops); + } else { + bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp); + XFS_FSS_TO_BB(mp, 1), 0); + if (bp) { + bp->b_ops = &xfs_sb_buf_ops; + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + } else + error = ENOMEM; + } + if (error) { xfs_warn(mp, "error %d reading secondary superblock for ag %d", @@ -409,6 +472,7 @@ xfs_growfs_data_private( break; } xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS); + /* * If we get an error writing out the alternate superblocks, * just issue a warning and continue. The real work is @@ -423,7 +487,7 @@ xfs_growfs_data_private( break; /* no point in continuing */ } } - return 0; + return error; error0: xfs_trans_cancel(tp, XFS_TRANS_ABORT); diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 76e81cff70b9..5399ef222dd7 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -21,7 +21,8 @@ /* * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, * other XFS code uses these values. Times are measured in centisecs (i.e. - * 100ths of a second). + * 100ths of a second) with the exception of eofb_timer, which is measured in + * seconds. */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ @@ -40,4 +41,5 @@ xfs_param_t xfs_params = { .rotorstep = { 1, 1, 255 }, .inherit_nodfrg = { 0, 1, 1 }, .fstrm_timer = { 1, 30*100, 3600*100}, + .eofb_timer = { 1, 300, 3600*24}, }; diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 445bf1aef31c..a815412eab80 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -200,7 +200,8 @@ xfs_ialloc_inode_init( */ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize * blks_per_cluster, 0); + mp->m_bsize * blks_per_cluster, + XBF_UNMAPPED); if (!fbuf) return ENOMEM; /* @@ -210,6 +211,7 @@ xfs_ialloc_inode_init( * to log a whole cluster of inodes instead of all the * individual transactions causing a lot of log traffic. */ + fbuf->b_ops = &xfs_inode_buf_ops; xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); for (i = 0; i < ninodes; i++) { int ioffset = i << mp->m_sb.sb_inodelog; @@ -250,6 +252,7 @@ xfs_ialloc_ag_alloc( /* boundary */ struct xfs_perag *pag; + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = tp->t_mountp; @@ -876,9 +879,9 @@ error0: * This function is designed to be called twice if it has to do an allocation * to make more free inodes. On the first call, *IO_agbp should be set to NULL. * If an inode is available without having to performn an allocation, an inode - * number is returned. In this case, *IO_agbp would be NULL. If an allocation - * needes to be done, xfs_dialloc would return the current AGI buffer in - * *IO_agbp. The caller should then commit the current transaction, allocate a + * number is returned. In this case, *IO_agbp is set to NULL. If an allocation + * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp. + * The caller should then commit the current transaction, allocate a * new transaction, and call xfs_dialloc() again, passing in the previous value * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI * buffer is locked across the two calls, the second call is guaranteed to have @@ -1471,6 +1474,57 @@ xfs_check_agi_unlinked( #define xfs_check_agi_unlinked(agi) #endif +static void +xfs_agi_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + int agi_ok; + + /* + * Validate the magic number of the agi block. + */ + agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) && + XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); + + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag) + agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) == + bp->b_pag->pag_agno; + + if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, + XFS_RANDOM_IALLOC_READ_AGI))) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + xfs_check_agi_unlinked(agi); +} + +static void +xfs_agi_read_verify( + struct xfs_buf *bp) +{ + xfs_agi_verify(bp); +} + +static void +xfs_agi_write_verify( + struct xfs_buf *bp) +{ + xfs_agi_verify(bp); +} + +const struct xfs_buf_ops xfs_agi_buf_ops = { + .verify_read = xfs_agi_read_verify, + .verify_write = xfs_agi_write_verify, +}; + /* * Read in the allocation group header (inode allocation section) */ @@ -1481,38 +1535,18 @@ xfs_read_agi( xfs_agnumber_t agno, /* allocation group number */ struct xfs_buf **bpp) /* allocation group hdr buf */ { - struct xfs_agi *agi; /* allocation group header */ - int agi_ok; /* agi is consistent */ int error; ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, bpp); + XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); if (error) return error; ASSERT(!xfs_buf_geterror(*bpp)); - agi = XFS_BUF_TO_AGI(*bpp); - - /* - * Validate the magic number of the agi block. - */ - agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) && - XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) && - be32_to_cpu(agi->agi_seqno) == agno; - if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, - XFS_RANDOM_IALLOC_READ_AGI))) { - XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW, - mp, agi); - xfs_trans_brelse(tp, *bpp); - return XFS_ERROR(EFSCORRUPTED); - } - xfs_buf_set_ref(*bpp, XFS_AGI_REF); - - xfs_check_agi_unlinked(agi); return 0; } diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 1fd6ea4e9c91..c8da3df271e6 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -147,7 +147,9 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, /* * Get the data from the pointed-to record. */ -extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, +int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *stat); +extern const struct xfs_buf_ops xfs_agi_buf_ops; + #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 2b8b7a37aa18..bec344b36507 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -33,6 +33,7 @@ #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" +#include "xfs_trace.h" STATIC int @@ -181,6 +182,59 @@ xfs_inobt_key_diff( cur->bc_rec.i.ir_startino; } +void +xfs_inobt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + unsigned int level; + int sblock_ok; /* block passes checks */ + + /* magic number and level verification */ + level = be16_to_cpu(block->bb_level); + sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) && + level < mp->m_in_maxlevels; + + /* numrecs verification */ + sblock_ok = sblock_ok && + be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0]; + + /* sibling pointer verification */ + sblock_ok = sblock_ok && + (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_leftsib && + (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_rightsib; + + if (!sblock_ok) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_inobt_read_verify( + struct xfs_buf *bp) +{ + xfs_inobt_verify(bp); +} + +static void +xfs_inobt_write_verify( + struct xfs_buf *bp) +{ + xfs_inobt_verify(bp); +} + +const struct xfs_buf_ops xfs_inobt_buf_ops = { + .verify_read = xfs_inobt_read_verify, + .verify_write = xfs_inobt_write_verify, +}; + #ifdef DEBUG STATIC int xfs_inobt_keys_inorder( @@ -218,6 +272,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, + .buf_ops = &xfs_inobt_buf_ops, #ifdef DEBUG .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index f782ad0c4769..25c0239a8eab 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -109,4 +109,6 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); +extern const struct xfs_buf_ops xfs_inobt_buf_ops; + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_icache.c index 9500caf15acf..96e344e3e927 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_icache.c @@ -19,6 +19,7 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" @@ -35,11 +36,425 @@ #include "xfs_quota.h" #include "xfs_trace.h" #include "xfs_fsops.h" +#include "xfs_icache.h" #include <linux/kthread.h> #include <linux/freezer.h> -struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ +STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_inode *ip); + +/* + * Allocate and initialise an xfs_inode. + */ +STATIC struct xfs_inode * +xfs_inode_alloc( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + struct xfs_inode *ip; + + /* + * if this didn't occur in transactions, we could use + * KM_MAYFAIL and return NULL here on ENOMEM. Set the + * code up to do this anyway. + */ + ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); + if (!ip) + return NULL; + if (inode_init_always(mp->m_super, VFS_I(ip))) { + kmem_zone_free(xfs_inode_zone, ip); + return NULL; + } + + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(!xfs_isiflocked(ip)); + ASSERT(ip->i_ino == 0); + + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + + /* initialise the xfs inode */ + ip->i_ino = ino; + ip->i_mount = mp; + memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); + ip->i_afp = NULL; + memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); + ip->i_flags = 0; + ip->i_delayed_blks = 0; + memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); + + return ip; +} + +STATIC void +xfs_inode_free_callback( + struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct xfs_inode *ip = XFS_I(inode); + + kmem_zone_free(xfs_inode_zone, ip); +} + +STATIC void +xfs_inode_free( + struct xfs_inode *ip) +{ + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + xfs_idestroy_fork(ip, XFS_DATA_FORK); + break; + } + + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + + if (ip->i_itemp) { + ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); + xfs_inode_item_destroy(ip); + ip->i_itemp = NULL; + } + + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(!xfs_isiflocked(ip)); + + /* + * Because we use RCU freeing we need to ensure the inode always + * appears to be reclaimed with an invalid inode number when in the + * free state. The ip->i_flags_lock provides the barrier against lookup + * races. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +} + +/* + * Check the validity of the inode we just found it the cache + */ +static int +xfs_iget_cache_hit( + struct xfs_perag *pag, + struct xfs_inode *ip, + xfs_ino_t ino, + int flags, + int lock_flags) __releases(RCU) +{ + struct inode *inode = VFS_I(ip); + struct xfs_mount *mp = ip->i_mount; + int error; + + /* + * check for re-use of an inode within an RCU grace period due to the + * radix tree nodes not being updated yet. We monitor for this by + * setting the inode number to zero before freeing the inode structure. + * If the inode has been reallocated and set up, then the inode number + * will not match, so check for that, too. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != ino) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + + + /* + * If we are racing with another cache hit that is currently + * instantiating this inode or currently recycling it out of + * reclaimabe state, wait for the initialisation to complete + * before continuing. + * + * XXX(hch): eventually we should do something equivalent to + * wait_on_inode to wait for these flags to be cleared + * instead of polling for it. + */ + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + + /* + * If lookup is racing with unlink return an error immediately. + */ + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_error; + } + + /* + * If IRECLAIMABLE is set, we've torn down the VFS inode already. + * Need to carefully get it back into useable state. + */ + if (ip->i_flags & XFS_IRECLAIMABLE) { + trace_xfs_iget_reclaim(ip); + + /* + * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode + * from stomping over us while we recycle the inode. We can't + * clear the radix tree reclaimable tag yet as it requires + * pag_ici_lock to be held exclusive. + */ + ip->i_flags |= XFS_IRECLAIM; + + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + error = -inode_init_always(mp->m_super, inode); + if (error) { + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the reclaim list. + */ + rcu_read_lock(); + spin_lock(&ip->i_flags_lock); + + ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + ASSERT(ip->i_flags & XFS_IRECLAIMABLE); + trace_xfs_iget_reclaim_fail(ip); + goto out_error; + } + + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + /* + * Clear the per-lifetime state in the inode as we are now + * effectively a new inode and need to return to the initial + * state before reuse occurs. + */ + ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; + ip->i_flags |= XFS_INEW; + __xfs_inode_clear_reclaim_tag(mp, pag, ip); + inode->i_state = I_NEW; + + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + } else { + /* If the VFS inode is being torn down, pause and try again. */ + if (!igrab(inode)) { + trace_xfs_iget_skip(ip); + error = EAGAIN; + goto out_error; + } + + /* We've got a live one. */ + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + trace_xfs_iget_hit(ip); + } + + if (lock_flags != 0) + xfs_ilock(ip, lock_flags); + + xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); + XFS_STATS_INC(xs_ig_found); + + return 0; + +out_error: + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + return error; +} + + +static int +xfs_iget_cache_miss( + struct xfs_mount *mp, + struct xfs_perag *pag, + xfs_trans_t *tp, + xfs_ino_t ino, + struct xfs_inode **ipp, + int flags, + int lock_flags) +{ + struct xfs_inode *ip; + int error; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); + int iflags; + + ip = xfs_inode_alloc(mp, ino); + if (!ip) + return ENOMEM; + + error = xfs_iread(mp, tp, ip, flags); + if (error) + goto out_destroy; + + trace_xfs_iget_miss(ip); + + if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_destroy; + } + + /* + * Preload the radix tree so we can insert safely under the + * write spinlock. Note that we cannot sleep inside the preload + * region. Since we can be called from transaction context, don't + * recurse into the file system. + */ + if (radix_tree_preload(GFP_NOFS)) { + error = EAGAIN; + goto out_destroy; + } + + /* + * Because the inode hasn't been added to the radix-tree yet it can't + * be found by another thread, so we can do the non-sleeping lock here. + */ + if (lock_flags) { + if (!xfs_ilock_nowait(ip, lock_flags)) + BUG(); + } + + /* + * These values must be set before inserting the inode into the radix + * tree as the moment it is inserted a concurrent lookup (allowed by the + * RCU locking mechanism) can find it and that lookup must see that this + * is an inode currently under construction (i.e. that XFS_INEW is set). + * The ip->i_flags_lock that protects the XFS_INEW flag forms the + * memory barrier that ensures this detection works correctly at lookup + * time. + */ + iflags = XFS_INEW; + if (flags & XFS_IGET_DONTCACHE) + iflags |= XFS_IDONTCACHE; + ip->i_udquot = ip->i_gdquot = NULL; + xfs_iflags_set(ip, iflags); + + /* insert the new inode */ + spin_lock(&pag->pag_ici_lock); + error = radix_tree_insert(&pag->pag_ici_root, agino, ip); + if (unlikely(error)) { + WARN_ON(error != -EEXIST); + XFS_STATS_INC(xs_ig_dup); + error = EAGAIN; + goto out_preload_end; + } + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + + *ipp = ip; + return 0; + +out_preload_end: + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + if (lock_flags) + xfs_iunlock(ip, lock_flags); +out_destroy: + __destroy_inode(VFS_I(ip)); + xfs_inode_free(ip); + return error; +} + +/* + * Look up an inode by number in the given file system. + * The inode is looked up in the cache held in each AG. + * If the inode is found in the cache, initialise the vfs inode + * if necessary. + * + * If it is not in core, read it in from the file system's device, + * add it to the cache and initialise the vfs inode. + * + * The inode is locked according to the value of the lock_flags parameter. + * This flag parameter indicates how and if the inode's IO lock and inode lock + * should be taken. + * + * mp -- the mount point structure for the current file system. It points + * to the inode hash table. + * tp -- a pointer to the current transaction if there is one. This is + * simply passed through to the xfs_iread() call. + * ino -- the number of the inode desired. This is the unique identifier + * within the file system for the inode being requested. + * lock_flags -- flags indicating how to lock the inode. See the comment + * for xfs_ilock() for a list of valid values. + */ +int +xfs_iget( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ino_t ino, + uint flags, + uint lock_flags, + xfs_inode_t **ipp) +{ + xfs_inode_t *ip; + int error; + xfs_perag_t *pag; + xfs_agino_t agino; + + /* + * xfs_reclaim_inode() uses the ILOCK to ensure an inode + * doesn't get freed while it's being referenced during a + * radix tree traversal here. It assumes this function + * aqcuires only the ILOCK (and therefore it has no need to + * involve the IOLOCK in this synchronization). + */ + ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); + + /* reject inode numbers outside existing AGs */ + if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) + return EINVAL; + + /* get the perag structure and ensure that it's inode capable */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); + agino = XFS_INO_TO_AGINO(mp, ino); + +again: + error = 0; + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + + if (ip) { + error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); + if (error) + goto out_error_or_again; + } else { + rcu_read_unlock(); + XFS_STATS_INC(xs_ig_missed); + + error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, + flags, lock_flags); + if (error) + goto out_error_or_again; + } + xfs_perag_put(pag); + + *ipp = ip; + + /* + * If we have a real type for an on-disk inode, we can set ops(&unlock) + * now. If it's a new inode being created, xfs_ialloc will handle it. + */ + if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) + xfs_setup_inode(ip); + return 0; + +out_error_or_again: + if (error == EAGAIN) { + delay(1); + goto again; + } + xfs_perag_put(pag); + return error; +} /* * The inode lookup is done in batches to keep the amount of lock traffic and @@ -101,8 +516,11 @@ xfs_inode_ag_walk( struct xfs_mount *mp, struct xfs_perag *pag, int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags), - int flags) + struct xfs_perag *pag, int flags, + void *args), + int flags, + void *args, + int tag) { uint32_t first_index; int last_error = 0; @@ -121,9 +539,17 @@ restart: int i; rcu_read_lock(); - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + + if (tag == -1) + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)batch, first_index, XFS_LOOKUP_BATCH); + else + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **) batch, first_index, + XFS_LOOKUP_BATCH, tag); + if (!nr_found) { rcu_read_unlock(); break; @@ -164,7 +590,7 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - error = execute(batch[i], pag, flags); + error = execute(batch[i], pag, flags, args); IRELE(batch[i]); if (error == EAGAIN) { skipped++; @@ -189,12 +615,40 @@ restart: return last_error; } +/* + * Background scanning to trim post-EOF preallocated space. This is queued + * based on the 'background_prealloc_discard_period' tunable (5m by default). + */ +STATIC void +xfs_queue_eofblocks( + struct xfs_mount *mp) +{ + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) + queue_delayed_work(mp->m_eofblocks_workqueue, + &mp->m_eofblocks_work, + msecs_to_jiffies(xfs_eofb_secs * 1000)); + rcu_read_unlock(); +} + +void +xfs_eofblocks_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_eofblocks_work); + xfs_icache_free_eofblocks(mp, NULL); + xfs_queue_eofblocks(mp); +} + int xfs_inode_ag_iterator( struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags), - int flags) + struct xfs_perag *pag, int flags, + void *args), + int flags, + void *args) { struct xfs_perag *pag; int error = 0; @@ -204,7 +658,7 @@ xfs_inode_ag_iterator( ag = 0; while ((pag = xfs_perag_get(mp, ag))) { ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags); + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); xfs_perag_put(pag); if (error) { last_error = error; @@ -215,224 +669,50 @@ xfs_inode_ag_iterator( return XFS_ERROR(last_error); } -STATIC int -xfs_sync_inode_data( - struct xfs_inode *ip, - struct xfs_perag *pag, - int flags) -{ - struct inode *inode = VFS_I(ip); - struct address_space *mapping = inode->i_mapping; - int error = 0; - - if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - return 0; - - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { - if (flags & SYNC_TRYLOCK) - return 0; - xfs_ilock(ip, XFS_IOLOCK_SHARED); - } - - error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? - 0 : XBF_ASYNC, FI_NONE); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return error; -} - -/* - * Write out pagecache data for the whole filesystem. - */ -STATIC int -xfs_sync_data( - struct xfs_mount *mp, - int flags) -{ - int error; - - ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); - - error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags); - if (error) - return XFS_ERROR(error); - - xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); - return 0; -} - -STATIC int -xfs_sync_fsdata( - struct xfs_mount *mp) -{ - struct xfs_buf *bp; - int error; - - /* - * If the buffer is pinned then push on the log so we won't get stuck - * waiting in the write for someone, maybe ourselves, to flush the log. - * - * Even though we just pushed the log above, we did not have the - * superblock buffer locked at that point so it can become pinned in - * between there and here. - */ - bp = xfs_getsb(mp, 0); - if (xfs_buf_ispinned(bp)) - xfs_log_force(mp, 0); - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - return error; -} - -/* - * When remounting a filesystem read-only or freezing the filesystem, we have - * two phases to execute. This first phase is syncing the data before we - * quiesce the filesystem, and the second is flushing all the inodes out after - * we've waited for all the transactions created by the first phase to - * complete. The second phase ensures that the inodes are written to their - * location on disk rather than just existing in transactions in the log. This - * means after a quiesce there is no log replay required to write the inodes to - * disk (this is the main difference between a sync and a quiesce). - */ -/* - * First stage of freeze - no writers will make progress now we are here, - * so we flush delwri and delalloc buffers here, then wait for all I/O to - * complete. Data is frozen at that point. Metadata is not frozen, - * transactions can still occur here so don't bother emptying the AIL - * because it'll just get dirty again. - */ int -xfs_quiesce_data( - struct xfs_mount *mp) -{ - int error, error2 = 0; - - /* force out the log */ - xfs_log_force(mp, XFS_LOG_SYNC); - - /* write superblock and hoover up shutdown errors */ - error = xfs_sync_fsdata(mp); - - /* mark the log as covered if needed */ - if (xfs_log_need_covered(mp)) - error2 = xfs_fs_log_dummy(mp); - - return error ? error : error2; -} - -/* - * Second stage of a quiesce. The data is already synced, now we have to take - * care of the metadata. New transactions are already blocked, so we need to - * wait for any remaining transactions to drain out before proceeding. - */ -void -xfs_quiesce_attr( - struct xfs_mount *mp) -{ - int error = 0; - - /* wait for all modifications to complete */ - while (atomic_read(&mp->m_active_trans) > 0) - delay(100); - - /* reclaim inodes to do any IO before the freeze completes */ - xfs_reclaim_inodes(mp, 0); - xfs_reclaim_inodes(mp, SYNC_WAIT); - - /* flush all pending changes from the AIL */ - xfs_ail_push_all_sync(mp->m_ail); - - /* - * Just warn here till VFS can correctly support - * read-only remount without racing. - */ - WARN_ON(atomic_read(&mp->m_active_trans) != 0); - - /* Push the superblock and write an unmount record */ - error = xfs_log_sbcount(mp); - if (error) - xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " - "Frozen image may not be consistent."); - xfs_log_unmount_write(mp); - - /* - * At this point we might have modified the superblock again and thus - * added an item to the AIL, thus flush it again. - */ - xfs_ail_push_all_sync(mp->m_ail); - - /* - * The superblock buffer is uncached and xfsaild_push() will lock and - * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() - * here but a lock on the superblock buffer will block until iodone() - * has completed. - */ - xfs_buf_lock(mp->m_sb_bp); - xfs_buf_unlock(mp->m_sb_bp); -} - -static void -xfs_syncd_queue_sync( - struct xfs_mount *mp) -{ - queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work, - msecs_to_jiffies(xfs_syncd_centisecs * 10)); -} - -/* - * Every sync period we need to unpin all items, reclaim inodes and sync - * disk quotas. We might need to cover the log to indicate that the - * filesystem is idle and not frozen. - */ -STATIC void -xfs_sync_worker( - struct work_struct *work) +xfs_inode_ag_iterator_tag( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, + struct xfs_perag *pag, int flags, + void *args), + int flags, + void *args, + int tag) { - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_sync_work); - int error; - - /* - * We shouldn't write/force the log if we are in the mount/unmount - * process or on a read only filesystem. The workqueue still needs to be - * active in both cases, however, because it is used for inode reclaim - * during these times. Use the MS_ACTIVE flag to avoid doing anything - * during mount. Doing work during unmount is avoided by calling - * cancel_delayed_work_sync on this work queue before tearing down - * the ail and the log in xfs_log_unmount. - */ - if (!(mp->m_super->s_flags & MS_ACTIVE) && - !(mp->m_flags & XFS_MOUNT_RDONLY)) { - /* dgc: errors ignored here */ - if (mp->m_super->s_writers.frozen == SB_UNFROZEN && - xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp); - else - xfs_log_force(mp, 0); + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; - /* start pushing all the metadata that is currently - * dirty */ - xfs_ail_push_all(mp->m_ail); + ag = 0; + while ((pag = xfs_perag_get_tag(mp, ag, tag))) { + ag = pag->pag_agno + 1; + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == EFSCORRUPTED) + break; + } } - - /* queue us up again */ - xfs_syncd_queue_sync(mp); + return XFS_ERROR(last_error); } /* * Queue a new inode reclaim pass if there are reclaimable inodes and there * isn't a reclaim pass already in progress. By default it runs every 5s based - * on the xfs syncd work default of 30s. Perhaps this should have it's own + * on the xfs periodic sync default of 30s. Perhaps this should have it's own * tunable, but that can be done if this method proves to be ineffective or too * aggressive. */ static void -xfs_syncd_queue_reclaim( +xfs_reclaim_work_queue( struct xfs_mount *mp) { rcu_read_lock(); if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { - queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, + queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); } rcu_read_unlock(); @@ -445,7 +725,7 @@ xfs_syncd_queue_reclaim( * goes low. It scans as quickly as possible avoiding locked inodes or those * already being flushed, and once done schedules a future pass. */ -STATIC void +void xfs_reclaim_worker( struct work_struct *work) { @@ -453,65 +733,10 @@ xfs_reclaim_worker( struct xfs_mount, m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_TRYLOCK); - xfs_syncd_queue_reclaim(mp); + xfs_reclaim_work_queue(mp); } -/* - * Flush delayed allocate data, attempting to free up reserved space - * from existing allocations. At this point a new allocation attempt - * has failed with ENOSPC and we are in the process of scratching our - * heads, looking about for more room. - * - * Queue a new data flush if there isn't one already in progress and - * wait for completion of the flush. This means that we only ever have one - * inode flush in progress no matter how many ENOSPC events are occurring and - * so will prevent the system from bogging down due to every concurrent - * ENOSPC event scanning all the active inodes in the system for writeback. - */ -void -xfs_flush_inodes( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - - queue_work(xfs_syncd_wq, &mp->m_flush_work); - flush_work(&mp->m_flush_work); -} - -STATIC void -xfs_flush_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(work, - struct xfs_mount, m_flush_work); - - xfs_sync_data(mp, SYNC_TRYLOCK); - xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); -} - -int -xfs_syncd_init( - struct xfs_mount *mp) -{ - INIT_WORK(&mp->m_flush_work, xfs_flush_worker); - INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); - INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); - - xfs_syncd_queue_sync(mp); - - return 0; -} - -void -xfs_syncd_stop( - struct xfs_mount *mp) -{ - cancel_delayed_work_sync(&mp->m_sync_work); - cancel_delayed_work_sync(&mp->m_reclaim_work); - cancel_work_sync(&mp->m_flush_work); -} - -void +static void __xfs_inode_set_reclaim_tag( struct xfs_perag *pag, struct xfs_inode *ip) @@ -529,7 +754,7 @@ __xfs_inode_set_reclaim_tag( spin_unlock(&ip->i_mount->m_perag_lock); /* schedule periodic background inode reclaim */ - xfs_syncd_queue_reclaim(ip->i_mount); + xfs_reclaim_work_queue(ip->i_mount); trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, -1, _RET_IP_); @@ -577,7 +802,7 @@ __xfs_inode_clear_reclaim( } } -void +STATIC void __xfs_inode_clear_reclaim_tag( xfs_mount_t *mp, xfs_perag_t *pag, @@ -787,9 +1012,9 @@ out: /* * We could return EAGAIN here to make reclaim rescan the inode tree in * a short while. However, this just burns CPU time scanning the tree - * waiting for IO to complete and xfssyncd never goes back to the idle - * state. Instead, return 0 to let the next scheduled background reclaim - * attempt to reclaim the inode again. + * waiting for IO to complete and the reclaim work never goes back to + * the idle state. Instead, return 0 to let the next scheduled + * background reclaim attempt to reclaim the inode again. */ return 0; } @@ -800,7 +1025,7 @@ out: * then a shut down during filesystem unmount reclaim walk leak all the * unreclaimed inodes. */ -int +STATIC int xfs_reclaim_inodes_ag( struct xfs_mount *mp, int flags, @@ -945,7 +1170,7 @@ xfs_reclaim_inodes_nr( int nr_to_scan) { /* kick background reclaimer and push the AIL */ - xfs_syncd_queue_reclaim(mp); + xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); @@ -971,3 +1196,146 @@ xfs_reclaim_inodes_count( return reclaimable; } +STATIC int +xfs_inode_match_id( + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) +{ + if (eofb->eof_flags & XFS_EOF_FLAGS_UID && + ip->i_d.di_uid != eofb->eof_uid) + return 0; + + if (eofb->eof_flags & XFS_EOF_FLAGS_GID && + ip->i_d.di_gid != eofb->eof_gid) + return 0; + + if (eofb->eof_flags & XFS_EOF_FLAGS_PRID && + xfs_get_projid(ip) != eofb->eof_prid) + return 0; + + return 1; +} + +STATIC int +xfs_inode_free_eofblocks( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags, + void *args) +{ + int ret; + struct xfs_eofblocks *eofb = args; + + if (!xfs_can_free_eofblocks(ip, false)) { + /* inode could be preallocated or append-only */ + trace_xfs_inode_free_eofblocks_invalid(ip); + xfs_inode_clear_eofblocks_tag(ip); + return 0; + } + + /* + * If the mapping is dirty the operation can block and wait for some + * time. Unless we are waiting, skip it. + */ + if (!(flags & SYNC_WAIT) && + mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) + return 0; + + if (eofb) { + if (!xfs_inode_match_id(ip, eofb)) + return 0; + + /* skip the inode if the file size is too small */ + if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && + XFS_ISIZE(ip) < eofb->eof_min_file_size) + return 0; + } + + ret = xfs_free_eofblocks(ip->i_mount, ip, true); + + /* don't revisit the inode if we're not waiting */ + if (ret == EAGAIN && !(flags & SYNC_WAIT)) + ret = 0; + + return ret; +} + +int +xfs_icache_free_eofblocks( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + int flags = SYNC_TRYLOCK; + + if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) + flags = SYNC_WAIT; + + return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, + eofb, XFS_ICI_EOFBLOCKS_TAG); +} + +void +xfs_inode_set_eofblocks_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + int tagged; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + trace_xfs_inode_set_eofblocks_tag(ip); + + tagged = radix_tree_tagged(&pag->pag_ici_root, + XFS_ICI_EOFBLOCKS_TAG); + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + if (!tagged) { + /* propagate the eofblocks tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + + /* kick off background trimming */ + xfs_queue_eofblocks(ip->i_mount); + + trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +void +xfs_inode_clear_eofblocks_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + trace_xfs_inode_clear_eofblocks_tag(ip); + + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) { + /* clear the eofblocks tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_icache.h index 941202e7ac6e..e0f138c70a2f 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_icache.h @@ -24,28 +24,30 @@ struct xfs_perag; #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ -extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ +int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, + uint flags, uint lock_flags, xfs_inode_t **ipp); -int xfs_syncd_init(struct xfs_mount *mp); -void xfs_syncd_stop(struct xfs_mount *mp); - -int xfs_quiesce_data(struct xfs_mount *mp); -void xfs_quiesce_attr(struct xfs_mount *mp); - -void xfs_flush_inodes(struct xfs_inode *ip); +void xfs_reclaim_worker(struct work_struct *work); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); -void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); -void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, - struct xfs_inode *ip); + +void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); +void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); +int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); +void xfs_eofblocks_worker(struct work_struct *); int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags); + int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, + int flags, void *args), + int flags, void *args); +int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, + int flags, void *args), + int flags, void *args, int tag); #endif diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c deleted file mode 100644 index 784a803383ec..000000000000 --- a/fs/xfs/xfs_iget.c +++ /dev/null @@ -1,705 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_acl.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_quota.h" -#include "xfs_utils.h" -#include "xfs_trans_priv.h" -#include "xfs_inode_item.h" -#include "xfs_bmap.h" -#include "xfs_trace.h" - - -/* - * Allocate and initialise an xfs_inode. - */ -STATIC struct xfs_inode * -xfs_inode_alloc( - struct xfs_mount *mp, - xfs_ino_t ino) -{ - struct xfs_inode *ip; - - /* - * if this didn't occur in transactions, we could use - * KM_MAYFAIL and return NULL here on ENOMEM. Set the - * code up to do this anyway. - */ - ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); - if (!ip) - return NULL; - if (inode_init_always(mp->m_super, VFS_I(ip))) { - kmem_zone_free(xfs_inode_zone, ip); - return NULL; - } - - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(!xfs_isiflocked(ip)); - ASSERT(ip->i_ino == 0); - - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - - /* initialise the xfs inode */ - ip->i_ino = ino; - ip->i_mount = mp; - memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); - ip->i_afp = NULL; - memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); - ip->i_flags = 0; - ip->i_delayed_blks = 0; - memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); - - return ip; -} - -STATIC void -xfs_inode_free_callback( - struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - struct xfs_inode *ip = XFS_I(inode); - - kmem_zone_free(xfs_inode_zone, ip); -} - -void -xfs_inode_free( - struct xfs_inode *ip) -{ - switch (ip->i_d.di_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - xfs_idestroy_fork(ip, XFS_DATA_FORK); - break; - } - - if (ip->i_afp) - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - - if (ip->i_itemp) { - ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); - xfs_inode_item_destroy(ip); - ip->i_itemp = NULL; - } - - /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(!xfs_isiflocked(ip)); - - /* - * Because we use RCU freeing we need to ensure the inode always - * appears to be reclaimed with an invalid inode number when in the - * free state. The ip->i_flags_lock provides the barrier against lookup - * races. - */ - spin_lock(&ip->i_flags_lock); - ip->i_flags = XFS_IRECLAIM; - ip->i_ino = 0; - spin_unlock(&ip->i_flags_lock); - - call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); -} - -/* - * Check the validity of the inode we just found it the cache - */ -static int -xfs_iget_cache_hit( - struct xfs_perag *pag, - struct xfs_inode *ip, - xfs_ino_t ino, - int flags, - int lock_flags) __releases(RCU) -{ - struct inode *inode = VFS_I(ip); - struct xfs_mount *mp = ip->i_mount; - int error; - - /* - * check for re-use of an inode within an RCU grace period due to the - * radix tree nodes not being updated yet. We monitor for this by - * setting the inode number to zero before freeing the inode structure. - * If the inode has been reallocated and set up, then the inode number - * will not match, so check for that, too. - */ - spin_lock(&ip->i_flags_lock); - if (ip->i_ino != ino) { - trace_xfs_iget_skip(ip); - XFS_STATS_INC(xs_ig_frecycle); - error = EAGAIN; - goto out_error; - } - - - /* - * If we are racing with another cache hit that is currently - * instantiating this inode or currently recycling it out of - * reclaimabe state, wait for the initialisation to complete - * before continuing. - * - * XXX(hch): eventually we should do something equivalent to - * wait_on_inode to wait for these flags to be cleared - * instead of polling for it. - */ - if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { - trace_xfs_iget_skip(ip); - XFS_STATS_INC(xs_ig_frecycle); - error = EAGAIN; - goto out_error; - } - - /* - * If lookup is racing with unlink return an error immediately. - */ - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - goto out_error; - } - - /* - * If IRECLAIMABLE is set, we've torn down the VFS inode already. - * Need to carefully get it back into useable state. - */ - if (ip->i_flags & XFS_IRECLAIMABLE) { - trace_xfs_iget_reclaim(ip); - - /* - * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode - * from stomping over us while we recycle the inode. We can't - * clear the radix tree reclaimable tag yet as it requires - * pag_ici_lock to be held exclusive. - */ - ip->i_flags |= XFS_IRECLAIM; - - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - - error = -inode_init_always(mp->m_super, inode); - if (error) { - /* - * Re-initializing the inode failed, and we are in deep - * trouble. Try to re-add it to the reclaim list. - */ - rcu_read_lock(); - spin_lock(&ip->i_flags_lock); - - ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); - ASSERT(ip->i_flags & XFS_IRECLAIMABLE); - trace_xfs_iget_reclaim_fail(ip); - goto out_error; - } - - spin_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - - /* - * Clear the per-lifetime state in the inode as we are now - * effectively a new inode and need to return to the initial - * state before reuse occurs. - */ - ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; - ip->i_flags |= XFS_INEW; - __xfs_inode_clear_reclaim_tag(mp, pag, ip); - inode->i_state = I_NEW; - - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - - spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); - } else { - /* If the VFS inode is being torn down, pause and try again. */ - if (!igrab(inode)) { - trace_xfs_iget_skip(ip); - error = EAGAIN; - goto out_error; - } - - /* We've got a live one. */ - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - trace_xfs_iget_hit(ip); - } - - if (lock_flags != 0) - xfs_ilock(ip, lock_flags); - - xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); - XFS_STATS_INC(xs_ig_found); - - return 0; - -out_error: - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - return error; -} - - -static int -xfs_iget_cache_miss( - struct xfs_mount *mp, - struct xfs_perag *pag, - xfs_trans_t *tp, - xfs_ino_t ino, - struct xfs_inode **ipp, - int flags, - int lock_flags) -{ - struct xfs_inode *ip; - int error; - xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); - int iflags; - - ip = xfs_inode_alloc(mp, ino); - if (!ip) - return ENOMEM; - - error = xfs_iread(mp, tp, ip, flags); - if (error) - goto out_destroy; - - trace_xfs_iget_miss(ip); - - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - goto out_destroy; - } - - /* - * Preload the radix tree so we can insert safely under the - * write spinlock. Note that we cannot sleep inside the preload - * region. Since we can be called from transaction context, don't - * recurse into the file system. - */ - if (radix_tree_preload(GFP_NOFS)) { - error = EAGAIN; - goto out_destroy; - } - - /* - * Because the inode hasn't been added to the radix-tree yet it can't - * be found by another thread, so we can do the non-sleeping lock here. - */ - if (lock_flags) { - if (!xfs_ilock_nowait(ip, lock_flags)) - BUG(); - } - - /* - * These values must be set before inserting the inode into the radix - * tree as the moment it is inserted a concurrent lookup (allowed by the - * RCU locking mechanism) can find it and that lookup must see that this - * is an inode currently under construction (i.e. that XFS_INEW is set). - * The ip->i_flags_lock that protects the XFS_INEW flag forms the - * memory barrier that ensures this detection works correctly at lookup - * time. - */ - iflags = XFS_INEW; - if (flags & XFS_IGET_DONTCACHE) - iflags |= XFS_IDONTCACHE; - ip->i_udquot = ip->i_gdquot = NULL; - xfs_iflags_set(ip, iflags); - - /* insert the new inode */ - spin_lock(&pag->pag_ici_lock); - error = radix_tree_insert(&pag->pag_ici_root, agino, ip); - if (unlikely(error)) { - WARN_ON(error != -EEXIST); - XFS_STATS_INC(xs_ig_dup); - error = EAGAIN; - goto out_preload_end; - } - spin_unlock(&pag->pag_ici_lock); - radix_tree_preload_end(); - - *ipp = ip; - return 0; - -out_preload_end: - spin_unlock(&pag->pag_ici_lock); - radix_tree_preload_end(); - if (lock_flags) - xfs_iunlock(ip, lock_flags); -out_destroy: - __destroy_inode(VFS_I(ip)); - xfs_inode_free(ip); - return error; -} - -/* - * Look up an inode by number in the given file system. - * The inode is looked up in the cache held in each AG. - * If the inode is found in the cache, initialise the vfs inode - * if necessary. - * - * If it is not in core, read it in from the file system's device, - * add it to the cache and initialise the vfs inode. - * - * The inode is locked according to the value of the lock_flags parameter. - * This flag parameter indicates how and if the inode's IO lock and inode lock - * should be taken. - * - * mp -- the mount point structure for the current file system. It points - * to the inode hash table. - * tp -- a pointer to the current transaction if there is one. This is - * simply passed through to the xfs_iread() call. - * ino -- the number of the inode desired. This is the unique identifier - * within the file system for the inode being requested. - * lock_flags -- flags indicating how to lock the inode. See the comment - * for xfs_ilock() for a list of valid values. - */ -int -xfs_iget( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - uint flags, - uint lock_flags, - xfs_inode_t **ipp) -{ - xfs_inode_t *ip; - int error; - xfs_perag_t *pag; - xfs_agino_t agino; - - /* - * xfs_reclaim_inode() uses the ILOCK to ensure an inode - * doesn't get freed while it's being referenced during a - * radix tree traversal here. It assumes this function - * aqcuires only the ILOCK (and therefore it has no need to - * involve the IOLOCK in this synchronization). - */ - ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); - - /* reject inode numbers outside existing AGs */ - if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) - return EINVAL; - - /* get the perag structure and ensure that it's inode capable */ - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); - agino = XFS_INO_TO_AGINO(mp, ino); - -again: - error = 0; - rcu_read_lock(); - ip = radix_tree_lookup(&pag->pag_ici_root, agino); - - if (ip) { - error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); - if (error) - goto out_error_or_again; - } else { - rcu_read_unlock(); - XFS_STATS_INC(xs_ig_missed); - - error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, - flags, lock_flags); - if (error) - goto out_error_or_again; - } - xfs_perag_put(pag); - - *ipp = ip; - - /* - * If we have a real type for an on-disk inode, we can set ops(&unlock) - * now. If it's a new inode being created, xfs_ialloc will handle it. - */ - if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) - xfs_setup_inode(ip); - return 0; - -out_error_or_again: - if (error == EAGAIN) { - delay(1); - goto again; - } - xfs_perag_put(pag); - return error; -} - -/* - * This is a wrapper routine around the xfs_ilock() routine - * used to centralize some grungy code. It is used in places - * that wish to lock the inode solely for reading the extents. - * The reason these places can't just call xfs_ilock(SHARED) - * is that the inode lock also guards to bringing in of the - * extents from disk for a file in b-tree format. If the inode - * is in b-tree format, then we need to lock the inode exclusively - * until the extents are read in. Locking it exclusively all - * the time would limit our parallelism unnecessarily, though. - * What we do instead is check to see if the extents have been - * read in yet, and only lock the inode exclusively if they - * have not. - * - * The function returns a value which should be given to the - * corresponding xfs_iunlock_map_shared(). This value is - * the mode in which the lock was actually taken. - */ -uint -xfs_ilock_map_shared( - xfs_inode_t *ip) -{ - uint lock_mode; - - if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && - ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { - lock_mode = XFS_ILOCK_EXCL; - } else { - lock_mode = XFS_ILOCK_SHARED; - } - - xfs_ilock(ip, lock_mode); - - return lock_mode; -} - -/* - * This is simply the unlock routine to go with xfs_ilock_map_shared(). - * All it does is call xfs_iunlock() with the given lock_mode. - */ -void -xfs_iunlock_map_shared( - xfs_inode_t *ip, - unsigned int lock_mode) -{ - xfs_iunlock(ip, lock_mode); -} - -/* - * The xfs inode contains 2 locks: a multi-reader lock called the - * i_iolock and a multi-reader lock called the i_lock. This routine - * allows either or both of the locks to be obtained. - * - * The 2 locks should always be ordered so that the IO lock is - * obtained first in order to prevent deadlock. - * - * ip -- the inode being locked - * lock_flags -- this parameter indicates the inode's locks - * to be locked. It can be: - * XFS_IOLOCK_SHARED, - * XFS_IOLOCK_EXCL, - * XFS_ILOCK_SHARED, - * XFS_ILOCK_EXCL, - * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, - * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, - * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, - * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL - */ -void -xfs_ilock( - xfs_inode_t *ip, - uint lock_flags) -{ - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - - if (lock_flags & XFS_IOLOCK_EXCL) - mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - else if (lock_flags & XFS_IOLOCK_SHARED) - mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - - if (lock_flags & XFS_ILOCK_EXCL) - mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); - else if (lock_flags & XFS_ILOCK_SHARED) - mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); - - trace_xfs_ilock(ip, lock_flags, _RET_IP_); -} - -/* - * This is just like xfs_ilock(), except that the caller - * is guaranteed not to sleep. It returns 1 if it gets - * the requested locks and 0 otherwise. If the IO lock is - * obtained but the inode lock cannot be, then the IO lock - * is dropped before returning. - * - * ip -- the inode being locked - * lock_flags -- this parameter indicates the inode's locks to be - * to be locked. See the comment for xfs_ilock() for a list - * of valid values. - */ -int -xfs_ilock_nowait( - xfs_inode_t *ip, - uint lock_flags) -{ - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - - if (lock_flags & XFS_IOLOCK_EXCL) { - if (!mrtryupdate(&ip->i_iolock)) - goto out; - } else if (lock_flags & XFS_IOLOCK_SHARED) { - if (!mrtryaccess(&ip->i_iolock)) - goto out; - } - if (lock_flags & XFS_ILOCK_EXCL) { - if (!mrtryupdate(&ip->i_lock)) - goto out_undo_iolock; - } else if (lock_flags & XFS_ILOCK_SHARED) { - if (!mrtryaccess(&ip->i_lock)) - goto out_undo_iolock; - } - trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); - return 1; - - out_undo_iolock: - if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); - else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); - out: - return 0; -} - -/* - * xfs_iunlock() is used to drop the inode locks acquired with - * xfs_ilock() and xfs_ilock_nowait(). The caller must pass - * in the flags given to xfs_ilock() or xfs_ilock_nowait() so - * that we know which locks to drop. - * - * ip -- the inode being unlocked - * lock_flags -- this parameter indicates the inode's locks to be - * to be unlocked. See the comment for xfs_ilock() for a list - * of valid values for this parameter. - * - */ -void -xfs_iunlock( - xfs_inode_t *ip, - uint lock_flags) -{ - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - ASSERT(lock_flags != 0); - - if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); - else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); - - if (lock_flags & XFS_ILOCK_EXCL) - mrunlock_excl(&ip->i_lock); - else if (lock_flags & XFS_ILOCK_SHARED) - mrunlock_shared(&ip->i_lock); - - trace_xfs_iunlock(ip, lock_flags, _RET_IP_); -} - -/* - * give up write locks. the i/o lock cannot be held nested - * if it is being demoted. - */ -void -xfs_ilock_demote( - xfs_inode_t *ip, - uint lock_flags) -{ - ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); - - if (lock_flags & XFS_ILOCK_EXCL) - mrdemote(&ip->i_lock); - if (lock_flags & XFS_IOLOCK_EXCL) - mrdemote(&ip->i_iolock); - - trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); -} - -#ifdef DEBUG -int -xfs_isilocked( - xfs_inode_t *ip, - uint lock_flags) -{ - if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { - if (!(lock_flags & XFS_ILOCK_SHARED)) - return !!ip->i_lock.mr_writer; - return rwsem_is_locked(&ip->i_lock.mr_lock); - } - - if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { - if (!(lock_flags & XFS_IOLOCK_SHARED)) - return !!ip->i_iolock.mr_writer; - return rwsem_is_locked(&ip->i_iolock.mr_lock); - } - - ASSERT(0); - return 0; -} -#endif - -void -__xfs_iflock( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); - - do { - prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); - if (xfs_isiflocked(ip)) - io_schedule(); - } while (!xfs_iflock_nowait(ip)); - - finish_wait(wq, &wait.wait); -} diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 2778258fcfa2..66282dcb821b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -45,6 +45,7 @@ #include "xfs_filestream.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_icache.h" kmem_zone_t *xfs_ifork_zone; kmem_zone_t *xfs_inode_zone; @@ -74,6 +75,256 @@ xfs_get_extsz_hint( return 0; } +/* + * This is a wrapper routine around the xfs_ilock() routine used to centralize + * some grungy code. It is used in places that wish to lock the inode solely + * for reading the extents. The reason these places can't just call + * xfs_ilock(SHARED) is that the inode lock also guards to bringing in of the + * extents from disk for a file in b-tree format. If the inode is in b-tree + * format, then we need to lock the inode exclusively until the extents are read + * in. Locking it exclusively all the time would limit our parallelism + * unnecessarily, though. What we do instead is check to see if the extents + * have been read in yet, and only lock the inode exclusively if they have not. + * + * The function returns a value which should be given to the corresponding + * xfs_iunlock_map_shared(). This value is the mode in which the lock was + * actually taken. + */ +uint +xfs_ilock_map_shared( + xfs_inode_t *ip) +{ + uint lock_mode; + + if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && + ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { + lock_mode = XFS_ILOCK_EXCL; + } else { + lock_mode = XFS_ILOCK_SHARED; + } + + xfs_ilock(ip, lock_mode); + + return lock_mode; +} + +/* + * This is simply the unlock routine to go with xfs_ilock_map_shared(). + * All it does is call xfs_iunlock() with the given lock_mode. + */ +void +xfs_iunlock_map_shared( + xfs_inode_t *ip, + unsigned int lock_mode) +{ + xfs_iunlock(ip, lock_mode); +} + +/* + * The xfs inode contains 2 locks: a multi-reader lock called the + * i_iolock and a multi-reader lock called the i_lock. This routine + * allows either or both of the locks to be obtained. + * + * The 2 locks should always be ordered so that the IO lock is + * obtained first in order to prevent deadlock. + * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks + * to be locked. It can be: + * XFS_IOLOCK_SHARED, + * XFS_IOLOCK_EXCL, + * XFS_ILOCK_SHARED, + * XFS_ILOCK_EXCL, + * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, + * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, + * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, + * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL + */ +void +xfs_ilock( + xfs_inode_t *ip, + uint lock_flags) +{ + trace_xfs_ilock(ip, lock_flags, _RET_IP_); + + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_IOLOCK_SHARED) + mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + + if (lock_flags & XFS_ILOCK_EXCL) + mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); + else if (lock_flags & XFS_ILOCK_SHARED) + mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); +} + +/* + * This is just like xfs_ilock(), except that the caller + * is guaranteed not to sleep. It returns 1 if it gets + * the requested locks and 0 otherwise. If the IO lock is + * obtained but the inode lock cannot be, then the IO lock + * is dropped before returning. + * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks to be + * to be locked. See the comment for xfs_ilock() for a list + * of valid values. + */ +int +xfs_ilock_nowait( + xfs_inode_t *ip, + uint lock_flags) +{ + trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); + + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) { + if (!mrtryupdate(&ip->i_iolock)) + goto out; + } else if (lock_flags & XFS_IOLOCK_SHARED) { + if (!mrtryaccess(&ip->i_iolock)) + goto out; + } + if (lock_flags & XFS_ILOCK_EXCL) { + if (!mrtryupdate(&ip->i_lock)) + goto out_undo_iolock; + } else if (lock_flags & XFS_ILOCK_SHARED) { + if (!mrtryaccess(&ip->i_lock)) + goto out_undo_iolock; + } + return 1; + + out_undo_iolock: + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + out: + return 0; +} + +/* + * xfs_iunlock() is used to drop the inode locks acquired with + * xfs_ilock() and xfs_ilock_nowait(). The caller must pass + * in the flags given to xfs_ilock() or xfs_ilock_nowait() so + * that we know which locks to drop. + * + * ip -- the inode being unlocked + * lock_flags -- this parameter indicates the inode's locks to be + * to be unlocked. See the comment for xfs_ilock() for a list + * of valid values for this parameter. + * + */ +void +xfs_iunlock( + xfs_inode_t *ip, + uint lock_flags) +{ + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT(lock_flags != 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + + if (lock_flags & XFS_ILOCK_EXCL) + mrunlock_excl(&ip->i_lock); + else if (lock_flags & XFS_ILOCK_SHARED) + mrunlock_shared(&ip->i_lock); + + trace_xfs_iunlock(ip, lock_flags, _RET_IP_); +} + +/* + * give up write locks. the i/o lock cannot be held nested + * if it is being demoted. + */ +void +xfs_ilock_demote( + xfs_inode_t *ip, + uint lock_flags) +{ + ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); + + if (lock_flags & XFS_ILOCK_EXCL) + mrdemote(&ip->i_lock); + if (lock_flags & XFS_IOLOCK_EXCL) + mrdemote(&ip->i_iolock); + + trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); +} + +#ifdef DEBUG +int +xfs_isilocked( + xfs_inode_t *ip, + uint lock_flags) +{ + if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { + if (!(lock_flags & XFS_ILOCK_SHARED)) + return !!ip->i_lock.mr_writer; + return rwsem_is_locked(&ip->i_lock.mr_lock); + } + + if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { + if (!(lock_flags & XFS_IOLOCK_SHARED)) + return !!ip->i_iolock.mr_writer; + return rwsem_is_locked(&ip->i_iolock.mr_lock); + } + + ASSERT(0); + return 0; +} +#endif + +void +__xfs_iflock( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); + + do { + prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_isiflocked(ip)) + io_schedule(); + } while (!xfs_iflock_nowait(ip)); + + finish_wait(wq, &wait.wait); +} + #ifdef DEBUG /* * Make sure that the extents in the given memory buffer @@ -131,6 +382,65 @@ xfs_inobp_check( } #endif +static void +xfs_inode_buf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int i; + int ni; + + /* + * Validate the magic number and version of every inode in the buffer + */ + ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; + for (i = 0; i < ni; i++) { + int di_ok; + xfs_dinode_t *dip; + + dip = (struct xfs_dinode *)xfs_buf_offset(bp, + (i << mp->m_sb.sb_inodelog)); + di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + XFS_DINODE_GOOD_VERSION(dip->di_version); + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, + XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, + mp, dip); +#ifdef DEBUG + xfs_emerg(mp, + "bad inode magic/vsn daddr %lld #%d (magic=%x)", + (unsigned long long)bp->b_bn, i, + be16_to_cpu(dip->di_magic)); + ASSERT(0); +#endif + } + } + xfs_inobp_check(mp, bp); +} + + +static void +xfs_inode_buf_read_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp); +} + +static void +xfs_inode_buf_write_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp); +} + +const struct xfs_buf_ops xfs_inode_buf_ops = { + .verify_read = xfs_inode_buf_read_verify, + .verify_write = xfs_inode_buf_write_verify, +}; + + /* * This routine is called to map an inode to the buffer containing the on-disk * version of the inode. It returns a pointer to the buffer containing the @@ -145,71 +455,33 @@ xfs_imap_to_bp( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_imap *imap, - struct xfs_dinode **dipp, + struct xfs_dinode **dipp, struct xfs_buf **bpp, uint buf_flags, uint iget_flags) { struct xfs_buf *bp; int error; - int i; - int ni; buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - (int)imap->im_len, buf_flags, &bp); + (int)imap->im_len, buf_flags, &bp, + &xfs_inode_buf_ops); if (error) { - if (error != EAGAIN) { - xfs_warn(mp, - "%s: xfs_trans_read_buf() returned error %d.", - __func__, error); - } else { + if (error == EAGAIN) { ASSERT(buf_flags & XBF_TRYLOCK); + return error; } - return error; - } - - /* - * Validate the magic number and version of every inode in the buffer - * (if DEBUG kernel) or the first inode in the buffer, otherwise. - */ -#ifdef DEBUG - ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog; -#else /* usual case */ - ni = 1; -#endif - for (i = 0; i < ni; i++) { - int di_ok; - xfs_dinode_t *dip; + if (error == EFSCORRUPTED && + (iget_flags & XFS_IGET_UNTRUSTED)) + return XFS_ERROR(EINVAL); - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - (i << mp->m_sb.sb_inodelog)); - di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && - XFS_DINODE_GOOD_VERSION(dip->di_version); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { - if (iget_flags & XFS_IGET_UNTRUSTED) { - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EINVAL); - } - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, - mp, dip); -#ifdef DEBUG - xfs_emerg(mp, - "bad inode magic/vsn daddr %lld #%d (magic=%x)", - (unsigned long long)imap->im_blkno, i, - be16_to_cpu(dip->di_magic)); - ASSERT(0); -#endif - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); - } + xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", + __func__, error); + return error; } - xfs_inobp_check(mp, bp); - *bpp = bp; *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); return 0; @@ -853,16 +1125,16 @@ xfs_iread_extents( * set according to the contents of the given cred structure. * * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() - * has a free inode available, call xfs_iget() - * to obtain the in-core version of the allocated inode. Finally, - * fill in the inode and log its initial contents. In this case, - * ialloc_context would be set to NULL and call_again set to false. + * has a free inode available, call xfs_iget() to obtain the in-core + * version of the allocated inode. Finally, fill in the inode and + * log its initial contents. In this case, ialloc_context would be + * set to NULL. * - * If xfs_dialloc() does not have an available inode, - * it will replenish its supply by doing an allocation. Since we can - * only do one allocation within a transaction without deadlocks, we - * must commit the current transaction before returning the inode itself. - * In this case, therefore, we will set call_again to true and return. + * If xfs_dialloc() does not have an available inode, it will replenish + * its supply by doing an allocation. Since we can only do one + * allocation within a transaction without deadlocks, we must commit + * the current transaction before returning the inode itself. + * In this case, therefore, we will set ialloc_context and return. * The caller should then commit the current transaction, start a new * transaction, and call xfs_ialloc() again to actually get the inode. * @@ -1509,10 +1781,23 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * blks_per_cluster, 0); + mp->m_bsize * blks_per_cluster, + XBF_UNMAPPED); if (!bp) return ENOMEM; + + /* + * This buffer may not have been correctly initialised as we + * didn't read it from disk. That's not important because we are + * only using to mark the buffer as stale in the log, and to + * attach stale cached inodes on it. That means it will never be + * dispatched for IO. If it is, we want to know about it, and we + * want it to fail. We can acheive this by adding a write + * verifier to the buffer. + */ + bp->b_ops = &xfs_inode_buf_ops; + /* * Walk the inodes already attached to the buffer and mark them * stale. These will all have the flush locks held, so an @@ -3660,3 +3945,40 @@ xfs_iext_irec_update_extoffs( ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; } } + +/* + * Test whether it is appropriate to check an inode for and free post EOF + * blocks. The 'force' parameter determines whether we should also consider + * regular files that are marked preallocated or append-only. + */ +bool +xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) +{ + /* prealloc/delalloc exists only on regular files */ + if (!S_ISREG(ip->i_d.di_mode)) + return false; + + /* + * Zero sized files with no cached pages and delalloc blocks will not + * have speculative prealloc/delalloc blocks to remove. + */ + if (VFS_I(ip)->i_size == 0 && + VN_CACHED(VFS_I(ip)) == 0 && + ip->i_delayed_blks == 0) + return false; + + /* If we haven't read in the extent list, then don't do it now. */ + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) + return false; + + /* + * Do not free real preallocated or append-only files unless the file + * has delalloc blocks and we are forced to remove them. + */ + if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) + if (!force || ip->i_delayed_blks == 0) + return false; + + return true; +} + diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 94b32f906e79..22baf6ea4fac 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -496,11 +496,10 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ ((pip)->i_d.di_mode & S_ISGID)) + /* - * xfs_iget.c prototypes. + * xfs_inode.c prototypes. */ -int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, - uint, uint, xfs_inode_t **); void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); @@ -508,11 +507,6 @@ void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); uint xfs_ilock_map_shared(xfs_inode_t *); void xfs_iunlock_map_shared(xfs_inode_t *, uint); -void xfs_inode_free(struct xfs_inode *ip); - -/* - * xfs_inode.c prototypes. - */ int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_buf **, xfs_inode_t **); @@ -591,6 +585,7 @@ void xfs_iext_irec_compact(xfs_ifork_t *); void xfs_iext_irec_compact_pages(xfs_ifork_t *); void xfs_iext_irec_compact_full(xfs_ifork_t *); void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); +bool xfs_can_free_eofblocks(struct xfs_inode *, bool); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) @@ -603,5 +598,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); extern struct kmem_zone *xfs_ifork_zone; extern struct kmem_zone *xfs_inode_zone; extern struct kmem_zone *xfs_ili_zone; +extern const struct xfs_buf_ops xfs_inode_buf_ops; #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 8305f2ac6773..c1c3ef88a260 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -42,6 +42,7 @@ #include "xfs_inode_item.h" #include "xfs_export.h" #include "xfs_trace.h" +#include "xfs_icache.h" #include <linux/capability.h> #include <linux/dcache.h> @@ -70,7 +71,7 @@ xfs_find_handle( int hsize; xfs_handle_t handle; struct inode *inode; - struct fd f; + struct fd f = {0}; struct path path; int error; struct xfs_inode *ip; @@ -1602,6 +1603,26 @@ xfs_file_ioctl( error = xfs_errortag_clearall(mp, 1); return -error; + case XFS_IOC_FREE_EOFBLOCKS: { + struct xfs_eofblocks eofb; + + if (copy_from_user(&eofb, arg, sizeof(eofb))) + return -XFS_ERROR(EFAULT); + + if (eofb.eof_version != XFS_EOFBLOCKS_VERSION) + return -XFS_ERROR(EINVAL); + + if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID) + return -XFS_ERROR(EINVAL); + + if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) || + memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64))) + return -XFS_ERROR(EINVAL); + + error = xfs_icache_free_eofblocks(mp, &eofb); + return -error; + } + default: return -ENOTTY; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 973dff6ad935..add06b4e9a63 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -41,6 +41,7 @@ #include "xfs_utils.h" #include "xfs_iomap.h" #include "xfs_trace.h" +#include "xfs_icache.h" #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ @@ -373,7 +374,7 @@ xfs_iomap_write_delay( xfs_extlen_t extsz; int nimaps; xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; - int prealloc, flushed = 0; + int prealloc; int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -434,31 +435,29 @@ retry: } /* - * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For - * ENOSPC, * flush all other inodes with delalloc blocks to free up - * some of the excess reserved metadata space. For both cases, retry + * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry * without EOF preallocation. */ if (nimaps == 0) { trace_xfs_delalloc_enospc(ip, offset, count); - if (flushed) - return XFS_ERROR(error ? error : ENOSPC); - - if (error == ENOSPC) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_inodes(ip); - xfs_ilock(ip, XFS_ILOCK_EXCL); + if (prealloc) { + prealloc = 0; + error = 0; + goto retry; } - - flushed = 1; - error = 0; - prealloc = 0; - goto retry; + return XFS_ERROR(error ? error : ENOSPC); } if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) return xfs_alert_fsblock_zero(ip, &imap[0]); + /* + * Tag the inode as speculatively preallocated so we can reclaim this + * space on demand, if necessary. + */ + if (prealloc) + xfs_inode_set_eofblocks_tag(ip); + *ret_imap = imap[0]; return 0; } @@ -584,7 +583,9 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. */ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, 0, &first_block, 1, + count_fsb, + XFS_BMAPI_STACK_SWITCH, + &first_block, 1, imap, &nimaps, &free_list); if (error) goto trans_cancel; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 4e00cf091d2c..d82efaa2ac73 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -38,6 +38,7 @@ #include "xfs_vnodeops.h" #include "xfs_inode_item.h" #include "xfs_trace.h" +#include "xfs_icache.h" #include <linux/capability.h> #include <linux/xattr.h> @@ -779,8 +780,8 @@ xfs_setattr_size( * care about here. */ if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { - error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, - FI_NONE); + error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ip->i_d.di_size, newsize); if (error) goto out_unlock; } @@ -854,6 +855,9 @@ xfs_setattr_size( * and do not wait the usual (long) time for writeout. */ xfs_iflags_set(ip, XFS_ITRUNCATED); + + /* A truncate down always removes post-EOF blocks. */ + xfs_inode_clear_eofblocks_tag(ip); } if (mask & ATTR_CTIME) { diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 01d10a66e302..2ea7d402188d 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -34,6 +34,7 @@ #include "xfs_error.h" #include "xfs_btree.h" #include "xfs_trace.h" +#include "xfs_icache.h" STATIC int xfs_internal_inum( @@ -395,7 +396,8 @@ xfs_bulkstat( if (xfs_inobt_maskn(chunkidx, nicluster) & ~r.ir_free) xfs_btree_reada_bufs(mp, agno, - agbno, nbcluster); + agbno, nbcluster, + &xfs_inode_buf_ops); } irbp->ir_startino = r.ir_startino; irbp->ir_freecount = r.ir_freecount; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 828662f70d64..fe7e4df85a7b 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -44,6 +44,7 @@ #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/slab.h> +#include <linux/crc32c.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/file.h> @@ -118,6 +119,7 @@ #define xfs_rotorstep xfs_params.rotorstep.val #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val #define xfs_fstrm_centisecs xfs_params.fstrm_timer.val +#define xfs_eofb_secs xfs_params.eofb_timer.val #define current_cpu() (raw_smp_processor_id()) #define current_pid() (current->pid) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7f4f9370d0e7..46bd9d52ab51 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -34,6 +34,8 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_trace.h" +#include "xfs_fsops.h" +#include "xfs_cksum.h" kmem_zone_t *xfs_log_ticket_zone; @@ -458,7 +460,8 @@ xfs_log_reserve( tic->t_trans_type = t_type; *ticp = tic; - xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt); + xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt + : tic->t_unit_res); trace_xfs_log_reserve(log, tic); @@ -679,25 +682,29 @@ out: } /* - * Finish the recovery of the file system. This is separate from - * the xfs_log_mount() call, because it depends on the code in - * xfs_mountfs() to read in the root and real-time bitmap inodes - * between calling xfs_log_mount() and here. + * Finish the recovery of the file system. This is separate from the + * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read + * in the root and real-time bitmap inodes between calling xfs_log_mount() and + * here. * - * mp - ubiquitous xfs mount point structure + * If we finish recovery successfully, start the background log work. If we are + * not doing recovery, then we have a RO filesystem and we don't need to start + * it. */ int xfs_log_mount_finish(xfs_mount_t *mp) { - int error; + int error = 0; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { error = xlog_recover_finish(mp->m_log); - else { - error = 0; + if (!error) + xfs_log_work_queue(mp); + } else { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } + return error; } @@ -850,15 +857,49 @@ xfs_log_unmount_write(xfs_mount_t *mp) } /* xfs_log_unmount_write */ /* - * Deallocate log structures for unmount/relocation. + * Empty the log for unmount/freeze. + * + * To do this, we first need to shut down the background log work so it is not + * trying to cover the log as we clean up. We then need to unpin all objects in + * the log so we can then flush them out. Once they have completed their IO and + * run the callbacks removing themselves from the AIL, we can write the unmount + * record. + */ +void +xfs_log_quiesce( + struct xfs_mount *mp) +{ + cancel_delayed_work_sync(&mp->m_log->l_work); + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * The superblock buffer is uncached and while xfs_ail_push_all_sync() + * will push it, xfs_wait_buftarg() will not wait for it. Further, + * xfs_buf_iowait() cannot be used because it was pushed with the + * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for + * the IO to complete. + */ + xfs_ail_push_all_sync(mp->m_ail); + xfs_wait_buftarg(mp->m_ddev_targp); + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_unlock(mp->m_sb_bp); + + xfs_log_unmount_write(mp); +} + +/* + * Shut down and release the AIL and Log. * - * We need to stop the aild from running before we destroy - * and deallocate the log as the aild references the log. + * During unmount, we need to ensure we flush all the dirty metadata objects + * from the AIL so that the log is empty before we write the unmount record to + * the log. Once this is done, we can tear down the AIL and the log. */ void -xfs_log_unmount(xfs_mount_t *mp) +xfs_log_unmount( + struct xfs_mount *mp) { - cancel_delayed_work_sync(&mp->m_sync_work); + xfs_log_quiesce(mp); + xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); } @@ -1090,8 +1131,7 @@ xlog_iodone(xfs_buf_t *bp) * with it being freed after writing the unmount record to the * log. */ - -} /* xlog_iodone */ +} /* * Return size of each in-core log record buffer. @@ -1161,6 +1201,40 @@ done: } /* xlog_get_iclog_buffer_size */ +void +xfs_log_work_queue( + struct xfs_mount *mp) +{ + queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work, + msecs_to_jiffies(xfs_syncd_centisecs * 10)); +} + +/* + * Every sync period we need to unpin all items in the AIL and push them to + * disk. If there is nothing dirty, then we might need to cover the log to + * indicate that the filesystem is idle. + */ +void +xfs_log_worker( + struct work_struct *work) +{ + struct xlog *log = container_of(to_delayed_work(work), + struct xlog, l_work); + struct xfs_mount *mp = log->l_mp; + + /* dgc: errors ignored - not fatal and nowhere to report them */ + if (xfs_log_need_covered(mp)) + xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + + /* start pushing all the metadata that is currently dirty */ + xfs_ail_push_all(mp->m_ail); + + /* queue us up again */ + xfs_log_work_queue(mp); +} + /* * This routine initializes some of the log structure for a given mount point. * Its primary purpose is to fill in enough, so recovery can occur. However, @@ -1195,6 +1269,7 @@ xlog_alloc_log( log->l_logBBsize = num_bblks; log->l_covered_state = XLOG_STATE_COVER_IDLE; log->l_flags |= XLOG_ACTIVE_RECOVERY; + INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); log->l_prev_block = -1; /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ @@ -1417,6 +1492,84 @@ xlog_grant_push_ail( } /* + * Stamp cycle number in every block + */ +STATIC void +xlog_pack_data( + struct xlog *log, + struct xlog_in_core *iclog, + int roundoff) +{ + int i, j, k; + int size = iclog->ic_offset + roundoff; + __be32 cycle_lsn; + xfs_caddr_t dp; + + cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); + + dp = iclog->ic_datap; + for (i = 0; i < BTOBB(size); i++) { + if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) + break; + iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = iclog->ic_data; + + for ( ; i < BTOBB(size); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + for (i = 1; i < log->l_iclog_heads; i++) + xhdr[i].hic_xheader.xh_cycle = cycle_lsn; + } +} + +/* + * Calculate the checksum for a log buffer. + * + * This is a little more complicated than it should be because the various + * headers and the actual data are non-contiguous. + */ +__le32 +xlog_cksum( + struct xlog *log, + struct xlog_rec_header *rhead, + char *dp, + int size) +{ + __uint32_t crc; + + /* first generate the crc for the record header ... */ + crc = xfs_start_cksum((char *)rhead, + sizeof(struct xlog_rec_header), + offsetof(struct xlog_rec_header, h_crc)); + + /* ... then for additional cycle data for v2 logs ... */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; + int i; + + for (i = 1; i < log->l_iclog_heads; i++) { + crc = crc32c(crc, &xhdr[i].hic_xheader, + sizeof(struct xlog_rec_ext_header)); + } + } + + /* ... and finally for the payload */ + crc = crc32c(crc, dp, size); + + return xfs_end_cksum(crc); +} + +/* * The bdstrat callback function for log bufs. This gives us a central * place to trap bufs in case we get hit by a log I/O error and need to * shutdown. Actually, in practice, even when we didn't get a log error, @@ -1476,7 +1629,6 @@ xlog_sync( struct xlog *log, struct xlog_in_core *iclog) { - xfs_caddr_t dptr; /* pointer to byte sized element */ xfs_buf_t *bp; int i; uint count; /* byte count of bwrite */ @@ -1485,6 +1637,7 @@ xlog_sync( int split = 0; /* split write into two regions */ int error; int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); + int size; XFS_STATS_INC(xs_log_writes); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); @@ -1515,13 +1668,10 @@ xlog_sync( xlog_pack_data(log, iclog, roundoff); /* real byte length */ - if (v2) { - iclog->ic_header.h_len = - cpu_to_be32(iclog->ic_offset + roundoff); - } else { - iclog->ic_header.h_len = - cpu_to_be32(iclog->ic_offset); - } + size = iclog->ic_offset; + if (v2) + size += roundoff; + iclog->ic_header.h_len = cpu_to_be32(size); bp = iclog->ic_bp; XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); @@ -1530,12 +1680,36 @@ xlog_sync( /* Do we need to split this write into 2 parts? */ if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { + char *dptr; + split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); - iclog->ic_bwritecnt = 2; /* split into 2 writes */ + iclog->ic_bwritecnt = 2; + + /* + * Bump the cycle numbers at the start of each block in the + * part of the iclog that ends up in the buffer that gets + * written to the start of the log. + * + * Watch out for the header magic number case, though. + */ + dptr = (char *)&iclog->ic_header + count; + for (i = 0; i < split; i += BBSIZE) { + __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr); + if (++cycle == XLOG_HEADER_MAGIC_NUM) + cycle++; + *(__be32 *)dptr = cpu_to_be32(cycle); + + dptr += BBSIZE; + } } else { iclog->ic_bwritecnt = 1; } + + /* calculcate the checksum */ + iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, + iclog->ic_datap, size); + bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); @@ -1589,19 +1763,6 @@ xlog_sync( bp->b_flags |= XBF_SYNCIO; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) bp->b_flags |= XBF_FUA; - dptr = bp->b_addr; - /* - * Bump the cycle numbers at the start of each block - * since this part of the buffer is at the start of - * a new cycle. Watch out for the header magic number - * case, though. - */ - for (i = 0; i < split; i += BBSIZE) { - be32_add_cpu((__be32 *)dptr, 1); - if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM) - be32_add_cpu((__be32 *)dptr, 1); - dptr += BBSIZE; - } ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); @@ -1618,7 +1779,6 @@ xlog_sync( return 0; } /* xlog_sync */ - /* * Deallocate a log structure */ @@ -2387,14 +2547,27 @@ xlog_state_do_callback( /* - * update the last_sync_lsn before we drop the + * Completion of a iclog IO does not imply that + * a transaction has completed, as transactions + * can be large enough to span many iclogs. We + * cannot change the tail of the log half way + * through a transaction as this may be the only + * transaction in the log and moving th etail to + * point to the middle of it will prevent + * recovery from finding the start of the + * transaction. Hence we should only update the + * last_sync_lsn if this iclog contains + * transaction completion callbacks on it. + * + * We have to do this before we drop the * icloglock to ensure we are the only one that * can update it. */ ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); - atomic64_set(&log->l_last_sync_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)); + if (iclog->ic_callback) + atomic64_set(&log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)); } else ioerrors++; @@ -3700,3 +3873,4 @@ xlog_iclogs_empty( } while (iclog != log->l_iclog); return 1; } + diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 748d312850e2..5caee96059df 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -181,5 +181,9 @@ int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, int flags); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); +void xfs_log_work_queue(struct xfs_mount *mp); +void xfs_log_worker(struct work_struct *work); +void xfs_log_quiesce(struct xfs_mount *mp); + #endif #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 18a801d76a42..16d8d12ea3b4 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -139,7 +139,6 @@ static inline uint xlog_get_client_id(__be32 i) /* * Flags for log structure */ -#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */ #define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being @@ -291,7 +290,7 @@ typedef struct xlog_rec_header { __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ __be64 h_lsn; /* lsn of this LR : 8 */ __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ - __be32 h_chksum; /* may not be used; non-zero if used : 4 */ + __le32 h_crc; /* crc of log record : 4 */ __be32 h_prev_block; /* block number to previous LR : 4 */ __be32 h_num_logops; /* number of log operations in this LR : 4 */ __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; @@ -495,6 +494,7 @@ struct xlog { struct xfs_buf *l_xbuf; /* extra buffer for log * wrapping */ struct xfs_buftarg *l_targ; /* buftarg of log */ + struct delayed_work l_work; /* background flush work */ uint l_flags; uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ struct list_head *l_buf_cancel_table; @@ -554,11 +554,9 @@ xlog_recover( extern int xlog_recover_finish( struct xlog *log); -extern void -xlog_pack_data( - struct xlog *log, - struct xlog_in_core *iclog, - int); + +extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, + char *dp, int size); extern kmem_zone_t *xfs_log_ticket_zone; struct xlog_ticket * diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 5da3ace352bf..96fcbb85ff83 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -41,7 +41,9 @@ #include "xfs_trans_priv.h" #include "xfs_quota.h" #include "xfs_utils.h" +#include "xfs_cksum.h" #include "xfs_trace.h" +#include "xfs_icache.h" STATIC int xlog_find_zeroed( @@ -2143,7 +2145,7 @@ xlog_recover_buffer_pass2( buf_flags |= XBF_UNMAPPED; bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, - buf_flags); + buf_flags, NULL); if (!bp) return XFS_ERROR(ENOMEM); error = bp->b_error; @@ -2236,7 +2238,8 @@ xlog_recover_inode_pass2( } trace_xfs_log_recover_inode_recover(log, in_f); - bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0); + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, + NULL); if (!bp) { error = ENOMEM; goto error; @@ -2547,7 +2550,8 @@ xlog_recover_dquot_pass2( ASSERT(dq_f->qlf_len == 1); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, + NULL); if (error) return error; @@ -3213,80 +3217,58 @@ xlog_recover_process_iunlinks( mp->m_dmevmask = mp_dmevmask; } - -#ifdef DEBUG -STATIC void -xlog_pack_data_checksum( - struct xlog *log, - struct xlog_in_core *iclog, - int size) -{ - int i; - __be32 *up; - uint chksum = 0; - - up = (__be32 *)iclog->ic_datap; - /* divide length by 4 to get # words */ - for (i = 0; i < (size >> 2); i++) { - chksum ^= be32_to_cpu(*up); - up++; - } - iclog->ic_header.h_chksum = cpu_to_be32(chksum); -} -#else -#define xlog_pack_data_checksum(log, iclog, size) -#endif - /* - * Stamp cycle number in every block + * Upack the log buffer data and crc check it. If the check fails, issue a + * warning if and only if the CRC in the header is non-zero. This makes the + * check an advisory warning, and the zero CRC check will prevent failure + * warnings from being emitted when upgrading the kernel from one that does not + * add CRCs by default. + * + * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log + * corruption failure */ -void -xlog_pack_data( - struct xlog *log, - struct xlog_in_core *iclog, - int roundoff) +STATIC int +xlog_unpack_data_crc( + struct xlog_rec_header *rhead, + xfs_caddr_t dp, + struct xlog *log) { - int i, j, k; - int size = iclog->ic_offset + roundoff; - __be32 cycle_lsn; - xfs_caddr_t dp; - - xlog_pack_data_checksum(log, iclog, size); - - cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); - - dp = iclog->ic_datap; - for (i = 0; i < BTOBB(size) && - i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; - } - - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - xlog_in_core_2_t *xhdr = iclog->ic_data; - - for ( ; i < BTOBB(size); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; + __le32 crc; + + crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + if (crc != rhead->h_crc) { + if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + xfs_alert(log->l_mp, + "log record CRC mismatch: found 0x%x, expected 0x%x.\n", + le32_to_cpu(rhead->h_crc), + le32_to_cpu(crc)); + xfs_hex_dump(dp, 32); } - for (i = 1; i < log->l_iclog_heads; i++) { - xhdr[i].hic_xheader.xh_cycle = cycle_lsn; - } + /* + * If we've detected a log record corruption, then we can't + * recover past this point. Abort recovery if we are enforcing + * CRC protection by punting an error back up the stack. + */ + if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) + return EFSCORRUPTED; } + + return 0; } -STATIC void +STATIC int xlog_unpack_data( struct xlog_rec_header *rhead, xfs_caddr_t dp, struct xlog *log) { int i, j, k; + int error; + + error = xlog_unpack_data_crc(rhead, dp, log); + if (error) + return error; for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { @@ -3303,6 +3285,8 @@ xlog_unpack_data( dp += BBSIZE; } } + + return 0; } STATIC int @@ -3434,9 +3418,13 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, - rhash, rhead, offset, pass))) + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, + rhash, rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } @@ -3541,14 +3529,19 @@ xlog_do_recovery_pass( * - order is important. */ error = xlog_bread_offset(log, 0, - bblks - split_bblks, hbp, + bblks - split_bblks, dbp, offset + BBTOB(split_bblks)); if (error) goto bread_err2; } - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, rhash, - rhead, offset, pass))) + + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks; } @@ -3573,9 +3566,13 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, rhash, - rhead, offset, pass))) + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } @@ -3689,13 +3686,14 @@ xlog_do_recover( /* * Now that we've finished replaying all buffer and inode - * updates, re-read in the superblock. + * updates, re-read in the superblock and reverify it. */ bp = xfs_getsb(log->l_mp, 0); XFS_BUF_UNDONE(bp); ASSERT(!(XFS_BUF_ISWRITE(bp))); XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); + bp->b_ops = &xfs_sb_buf_ops; xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); if (error) { @@ -3707,7 +3705,7 @@ xlog_do_recover( /* Convert superblock from on-disk format */ sbp = &log->l_mp->m_sb; - xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp)); + xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); ASSERT(xfs_sb_good_version(sbp)); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b2bd3a0e6376..da508463ff10 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -42,6 +42,7 @@ #include "xfs_fsops.h" #include "xfs_utils.h" #include "xfs_trace.h" +#include "xfs_icache.h" #ifdef HAVE_PERCPU_SB @@ -303,9 +304,8 @@ STATIC int xfs_mount_validate_sb( xfs_mount_t *mp, xfs_sb_t *sbp, - int flags) + bool check_inprogress) { - int loud = !(flags & XFS_MFSI_QUIET); /* * If the log device and data device have the @@ -315,21 +315,18 @@ xfs_mount_validate_sb( * a volume filesystem in a non-volume manner. */ if (sbp->sb_magicnum != XFS_SB_MAGIC) { - if (loud) - xfs_warn(mp, "bad magic number"); + xfs_warn(mp, "bad magic number"); return XFS_ERROR(EWRONGFS); } if (!xfs_sb_good_version(sbp)) { - if (loud) - xfs_warn(mp, "bad version"); + xfs_warn(mp, "bad version"); return XFS_ERROR(EWRONGFS); } if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { - if (loud) - xfs_warn(mp, + xfs_warn(mp, "filesystem is marked as having an external log; " "specify logdev on the mount command line."); return XFS_ERROR(EINVAL); @@ -337,8 +334,7 @@ xfs_mount_validate_sb( if (unlikely( sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { - if (loud) - xfs_warn(mp, + xfs_warn(mp, "filesystem is marked as having an internal log; " "do not specify logdev on the mount command line."); return XFS_ERROR(EINVAL); @@ -372,8 +368,7 @@ xfs_mount_validate_sb( sbp->sb_dblocks == 0 || sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { - if (loud) - XFS_CORRUPTION_ERROR("SB sanity check failed", + XFS_CORRUPTION_ERROR("SB sanity check failed", XFS_ERRLEVEL_LOW, mp, sbp); return XFS_ERROR(EFSCORRUPTED); } @@ -382,12 +377,10 @@ xfs_mount_validate_sb( * Until this is fixed only page-sized or smaller data blocks work. */ if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { - if (loud) { - xfs_warn(mp, + xfs_warn(mp, "File system with blocksize %d bytes. " "Only pagesize (%ld) or less will currently work.", sbp->sb_blocksize, PAGE_SIZE); - } return XFS_ERROR(ENOSYS); } @@ -401,23 +394,20 @@ xfs_mount_validate_sb( case 2048: break; default: - if (loud) - xfs_warn(mp, "inode size of %d bytes not supported", + xfs_warn(mp, "inode size of %d bytes not supported", sbp->sb_inodesize); return XFS_ERROR(ENOSYS); } if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { - if (loud) - xfs_warn(mp, + xfs_warn(mp, "file system too large to be mounted on this system."); return XFS_ERROR(EFBIG); } - if (unlikely(sbp->sb_inprogress)) { - if (loud) - xfs_warn(mp, "file system busy"); + if (check_inprogress && sbp->sb_inprogress) { + xfs_warn(mp, "Offline file system operation in progress!"); return XFS_ERROR(EFSCORRUPTED); } @@ -425,9 +415,7 @@ xfs_mount_validate_sb( * Version 1 directory format has never worked on Linux. */ if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { - if (loud) - xfs_warn(mp, - "file system using version 1 directory format"); + xfs_warn(mp, "file system using version 1 directory format"); return XFS_ERROR(ENOSYS); } @@ -520,11 +508,9 @@ out_unwind: void xfs_sb_from_disk( - struct xfs_mount *mp, + struct xfs_sb *to, xfs_dsb_t *from) { - struct xfs_sb *to = &mp->m_sb; - to->sb_magicnum = be32_to_cpu(from->sb_magicnum); to->sb_blocksize = be32_to_cpu(from->sb_blocksize); to->sb_dblocks = be64_to_cpu(from->sb_dblocks); @@ -626,6 +612,72 @@ xfs_sb_to_disk( } } +static void +xfs_sb_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_sb sb; + int error; + + xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); + + /* + * Only check the in progress field for the primary superblock as + * mkfs.xfs doesn't clear it from secondary superblocks. + */ + error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); + if (error) + xfs_buf_ioerror(bp, error); +} + +static void +xfs_sb_read_verify( + struct xfs_buf *bp) +{ + xfs_sb_verify(bp); +} + +/* + * We may be probed for a filesystem match, so we may not want to emit + * messages when the superblock buffer is not actually an XFS superblock. + * If we find an XFS superblock, the run a normal, noisy mount because we are + * really going to mount it and want to know about errors. + */ +static void +xfs_sb_quiet_read_verify( + struct xfs_buf *bp) +{ + struct xfs_sb sb; + + xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); + + if (sb.sb_magicnum == XFS_SB_MAGIC) { + /* XFS filesystem, verify noisily! */ + xfs_sb_read_verify(bp); + return; + } + /* quietly fail */ + xfs_buf_ioerror(bp, EFSCORRUPTED); +} + +static void +xfs_sb_write_verify( + struct xfs_buf *bp) +{ + xfs_sb_verify(bp); +} + +const struct xfs_buf_ops xfs_sb_buf_ops = { + .verify_read = xfs_sb_read_verify, + .verify_write = xfs_sb_write_verify, +}; + +static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { + .verify_read = xfs_sb_quiet_read_verify, + .verify_write = xfs_sb_write_verify, +}; + /* * xfs_readsb * @@ -651,26 +703,27 @@ xfs_readsb(xfs_mount_t *mp, int flags) reread: bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0); + BTOBB(sector_size), 0, + loud ? &xfs_sb_buf_ops + : &xfs_sb_quiet_buf_ops); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); return EIO; } - - /* - * Initialize the mount structure from the superblock. - * But first do some basic consistency checking. - */ - xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp)); - error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); - if (error) { + if (bp->b_error) { + error = bp->b_error; if (loud) xfs_warn(mp, "SB validate failed"); goto release_buf; } /* + * Initialize the mount structure from the superblock. + */ + xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); + + /* * We must be able to do sector-sized and sector-aligned IO. */ if (sector_size > mp->m_sb.sb_sectsize) { @@ -1001,7 +1054,7 @@ xfs_check_sizes(xfs_mount_t *mp) } bp = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, NULL); if (!bp) { xfs_warn(mp, "last sector read failed"); return EIO; @@ -1016,7 +1069,7 @@ xfs_check_sizes(xfs_mount_t *mp) } bp = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) { xfs_warn(mp, "log device read failed"); return EIO; @@ -1427,6 +1480,8 @@ xfs_unmountfs( __uint64_t resblks; int error; + cancel_delayed_work_sync(&mp->m_eofblocks_work); + xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); IRELE(mp->m_rootip); @@ -1450,21 +1505,16 @@ xfs_unmountfs( /* * And reclaim all inodes. At this point there should be no dirty - * inode, and none should be pinned or locked, but use synchronous - * reclaim just to be sure. + * inodes and none should be pinned or locked, but use synchronous + * reclaim just to be sure. We can stop background inode reclaim + * here as well if it is still running. */ + cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_qm_unmount(mp); /* - * Flush out the log synchronously so that we know for sure - * that nothing is pinned. This is important because bflush() - * will skip pinned buffers. - */ - xfs_log_force(mp, XFS_LOG_SYNC); - - /* * Unreserve any blocks we have so that when we unmount we don't account * the reserved free space as used. This is really only necessary for * lazy superblock counting because it trusts the incore superblock @@ -1489,23 +1539,6 @@ xfs_unmountfs( xfs_warn(mp, "Unable to update superblock counters. " "Freespace may not be correct on next mount."); - /* - * At this point we might have modified the superblock again and thus - * added an item to the AIL, thus flush it again. - */ - xfs_ail_push_all_sync(mp->m_ail); - xfs_wait_buftarg(mp->m_ddev_targp); - - /* - * The superblock buffer is uncached and xfsaild_push() will lock and - * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() - * here but a lock on the superblock buffer will block until iodone() - * has completed. - */ - xfs_buf_lock(mp->m_sb_bp); - xfs_buf_unlock(mp->m_sb_bp); - - xfs_log_unmount_write(mp); xfs_log_unmount(mp); xfs_uuid_unmount(mp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index deee09e534dc..bab8314507e4 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -51,8 +51,6 @@ typedef struct xfs_trans_reservations { #else /* __KERNEL__ */ -#include "xfs_sync.h" - struct xlog; struct xfs_inode; struct xfs_mru_cache; @@ -197,9 +195,9 @@ typedef struct xfs_mount { struct mutex m_icsb_mutex; /* balancer sync lock */ #endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ - struct delayed_work m_sync_work; /* background sync work */ struct delayed_work m_reclaim_work; /* background inode reclaim */ - struct work_struct m_flush_work; /* background inode flush */ + struct delayed_work m_eofblocks_work; /* background eof blocks + trimming */ __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ struct shrinker m_inode_shrink; /* inode reclaim shrinker */ @@ -209,6 +207,9 @@ typedef struct xfs_mount { struct workqueue_struct *m_data_workqueue; struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; + struct workqueue_struct *m_reclaim_workqueue; + struct workqueue_struct *m_log_workqueue; + struct workqueue_struct *m_eofblocks_workqueue; } xfs_mount_t; /* @@ -387,7 +388,9 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); extern void xfs_mod_sb(struct xfs_trans *, __int64_t); extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, xfs_agnumber_t *); -extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *); +extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); +extern const struct xfs_buf_ops xfs_sb_buf_ops; + #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 2e86fa0cfc0d..60eff4763156 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -40,6 +40,7 @@ #include "xfs_utils.h" #include "xfs_qm.h" #include "xfs_trace.h" +#include "xfs_icache.h" /* * The global quota manager. There is only one of these for the entire @@ -891,7 +892,8 @@ xfs_qm_dqiter_bufs( while (blkcnt--) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), - mp->m_quotainfo->qi_dqchunklen, 0, &bp); + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + &xfs_dquot_buf_ops); if (error) break; @@ -978,7 +980,8 @@ xfs_qm_dqiterate( while (rablkcnt--) { xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, rablkno), - mp->m_quotainfo->qi_dqchunklen); + mp->m_quotainfo->qi_dqchunklen, + NULL); rablkno++; } } @@ -1453,7 +1456,7 @@ xfs_qm_dqreclaim_one( int error; if (!xfs_dqlock_nowait(dqp)) - goto out_busy; + goto out_move_tail; /* * This dquot has acquired a reference in the meantime remove it from @@ -1476,7 +1479,7 @@ xfs_qm_dqreclaim_one( * getting flushed to disk, we don't want to reclaim it. */ if (!xfs_dqflock_nowait(dqp)) - goto out_busy; + goto out_unlock_move_tail; if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; @@ -1487,7 +1490,7 @@ xfs_qm_dqreclaim_one( if (error) { xfs_warn(mp, "%s: dquot %p flush failed", __func__, dqp); - goto out_busy; + goto out_unlock_move_tail; } xfs_buf_delwri_queue(bp, buffer_list); @@ -1496,7 +1499,7 @@ xfs_qm_dqreclaim_one( * Give the dquot another try on the freelist, as the * flushing will take some time. */ - goto out_busy; + goto out_unlock_move_tail; } xfs_dqfunlock(dqp); @@ -1515,14 +1518,13 @@ xfs_qm_dqreclaim_one( XFS_STATS_INC(xs_qm_dqreclaims); return; -out_busy: - xfs_dqunlock(dqp); - /* * Move the dquot to the tail of the list so that we don't spin on it. */ +out_unlock_move_tail: + xfs_dqunlock(dqp); +out_move_tail: list_move_tail(&dqp->q_lru, &qi->qi_lru_list); - trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(xs_qm_dqreclaim_misses); } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 858a3b186110..5f53e75409b8 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -40,6 +40,7 @@ #include "xfs_utils.h" #include "xfs_qm.h" #include "xfs_trace.h" +#include "xfs_icache.h" STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, @@ -845,7 +846,8 @@ STATIC int xfs_dqrele_inode( struct xfs_inode *ip, struct xfs_perag *pag, - int flags) + int flags, + void *args) { /* skip quota inodes */ if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || @@ -881,5 +883,5 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags); + xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL); } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index ca28a4ba4b54..98dc670d3ee0 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -38,6 +38,7 @@ #include "xfs_utils.h" #include "xfs_trace.h" #include "xfs_buf.h" +#include "xfs_icache.h" /* @@ -869,7 +870,7 @@ xfs_rtbuf_get( ASSERT(map.br_startblock != NULLFSBLOCK); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), - mp->m_bsize, 0, &bp); + mp->m_bsize, 0, &bp, NULL); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -1872,9 +1873,14 @@ xfs_growfs_rt( */ bp = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, nrblocks - 1), - XFS_FSB_TO_BB(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) return EIO; + if (bp->b_error) { + error = bp->b_error; + xfs_buf_relse(bp); + return error; + } xfs_buf_relse(bp); /* @@ -2219,9 +2225,11 @@ xfs_rtmount_init( } bp = xfs_buf_read_uncached(mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0); - if (!bp) { + XFS_FSB_TO_BB(mp, 1), 0, NULL); + if (!bp || bp->b_error) { xfs_warn(mp, "realtime device size check failed"); + if (bp) + xfs_buf_relse(bp); return EIO; } xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index f429d9d5d325..a05b45175fb0 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -81,6 +81,7 @@ struct xfs_mount; #define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ #define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ #define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ +#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ #define XFS_SB_VERSION2_OKREALFBITS \ (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ @@ -503,6 +504,12 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); } +static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp) +{ + return (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT)); +} + /* * end of superblock version macros */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 26a09bd7f975..ab8839b26272 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -49,7 +49,7 @@ #include "xfs_extfree_item.h" #include "xfs_mru_cache.h" #include "xfs_inode_item.h" -#include "xfs_sync.h" +#include "xfs_icache.h" #include "xfs_trace.h" #include <linux/namei.h> @@ -863,8 +863,30 @@ xfs_init_mount_workqueues( WQ_MEM_RECLAIM, 0, mp->m_fsname); if (!mp->m_cil_workqueue) goto out_destroy_unwritten; + + mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", + WQ_NON_REENTRANT, 0, mp->m_fsname); + if (!mp->m_reclaim_workqueue) + goto out_destroy_cil; + + mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", + WQ_NON_REENTRANT, 0, mp->m_fsname); + if (!mp->m_log_workqueue) + goto out_destroy_reclaim; + + mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", + WQ_NON_REENTRANT, 0, mp->m_fsname); + if (!mp->m_eofblocks_workqueue) + goto out_destroy_log; + return 0; +out_destroy_log: + destroy_workqueue(mp->m_log_workqueue); +out_destroy_reclaim: + destroy_workqueue(mp->m_reclaim_workqueue); +out_destroy_cil: + destroy_workqueue(mp->m_cil_workqueue); out_destroy_unwritten: destroy_workqueue(mp->m_unwritten_workqueue); out_destroy_data_iodone_queue: @@ -877,11 +899,32 @@ STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { + destroy_workqueue(mp->m_eofblocks_workqueue); + destroy_workqueue(mp->m_log_workqueue); + destroy_workqueue(mp->m_reclaim_workqueue); destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_data_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); } +/* + * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK + * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting + * for IO to complete so that we effectively throttle multiple callers to the + * rate at which IO is completing. + */ +void +xfs_flush_inodes( + struct xfs_mount *mp) +{ + struct super_block *sb = mp->m_super; + + if (down_read_trylock(&sb->s_umount)) { + sync_inodes_sb(sb); + up_read(&sb->s_umount); + } +} + /* Catch misguided souls that try to use this interface on XFS */ STATIC struct inode * xfs_fs_alloc_inode( @@ -1006,9 +1049,8 @@ xfs_fs_put_super( struct xfs_mount *mp = XFS_M(sb); xfs_filestream_unmount(mp); - cancel_delayed_work_sync(&mp->m_sync_work); xfs_unmountfs(mp); - xfs_syncd_stop(mp); + xfs_freesb(mp); xfs_icsb_destroy_counters(mp); xfs_destroy_mount_workqueues(mp); @@ -1023,7 +1065,6 @@ xfs_fs_sync_fs( int wait) { struct xfs_mount *mp = XFS_M(sb); - int error; /* * Doing anything during the async pass would be counterproductive. @@ -1031,17 +1072,14 @@ xfs_fs_sync_fs( if (!wait) return 0; - error = xfs_quiesce_data(mp); - if (error) - return -error; - + xfs_log_force(mp, XFS_LOG_SYNC); if (laptop_mode) { /* * The disk must be active because we're syncing. - * We schedule xfssyncd now (now that the disk is + * We schedule log work now (now that the disk is * active) instead of later (when it might not be). */ - flush_delayed_work(&mp->m_sync_work); + flush_delayed_work(&mp->m_log->l_work); } return 0; @@ -1118,6 +1156,48 @@ xfs_restore_resvblks(struct xfs_mount *mp) xfs_reserve_blocks(mp, &resblks, NULL); } +/* + * Trigger writeback of all the dirty metadata in the file system. + * + * This ensures that the metadata is written to their location on disk rather + * than just existing in transactions in the log. This means after a quiesce + * there is no log replay required to write the inodes to disk - this is the + * primary difference between a sync and a quiesce. + * + * Note: xfs_log_quiesce() stops background log work - the callers must ensure + * it is started again when appropriate. + */ +void +xfs_quiesce_attr( + struct xfs_mount *mp) +{ + int error = 0; + + /* wait for all modifications to complete */ + while (atomic_read(&mp->m_active_trans) > 0) + delay(100); + + /* force the log to unpin objects from the now complete transactions */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* reclaim inodes to do any IO before the freeze completes */ + xfs_reclaim_inodes(mp, 0); + xfs_reclaim_inodes(mp, SYNC_WAIT); + + /* Push the superblock and write an unmount record */ + error = xfs_log_sbcount(mp); + if (error) + xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " + "Frozen image may not be consistent."); + /* + * Just warn here till VFS can correctly support + * read-only remount without racing. + */ + WARN_ON(atomic_read(&mp->m_active_trans) != 0); + + xfs_log_quiesce(mp); +} + STATIC int xfs_fs_remount( struct super_block *sb, @@ -1198,20 +1278,18 @@ xfs_fs_remount( * value if it is non-zero, otherwise go with the default. */ xfs_restore_resvblks(mp); + xfs_log_work_queue(mp); } /* rw -> ro */ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { /* - * After we have synced the data but before we sync the - * metadata, we need to free up the reserve block pool so that - * the used block count in the superblock on disk is correct at - * the end of the remount. Stash the current reserve pool size - * so that if we get remounted rw, we can return it to the same - * size. + * Before we sync the metadata, we need to free up the reserve + * block pool so that the used block count in the superblock on + * disk is correct at the end of the remount. Stash the current + * reserve pool size so that if we get remounted rw, we can + * return it to the same size. */ - - xfs_quiesce_data(mp); xfs_save_resvblks(mp); xfs_quiesce_attr(mp); mp->m_flags |= XFS_MOUNT_RDONLY; @@ -1243,6 +1321,7 @@ xfs_fs_unfreeze( struct xfs_mount *mp = XFS_M(sb); xfs_restore_resvblks(mp); + xfs_log_work_queue(mp); return 0; } @@ -1321,6 +1400,8 @@ xfs_fs_fill_super( spin_lock_init(&mp->m_sb_lock); mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); + INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); mp->m_super = sb; sb->s_fs_info = mp; @@ -1371,10 +1452,6 @@ xfs_fs_fill_super( /* * we must configure the block size in the superblock before we run the * full mount process as the mount process can lookup and cache inodes. - * For the same reason we must also initialise the syncd and register - * the inode cache shrinker so that inodes can be reclaimed during - * operations like a quotacheck that iterate all inodes in the - * filesystem. */ sb->s_magic = XFS_SB_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; @@ -1384,13 +1461,9 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); - error = xfs_syncd_init(mp); - if (error) - goto out_filestream_unmount; - error = xfs_mountfs(mp); if (error) - goto out_syncd_stop; + goto out_filestream_unmount; root = igrab(VFS_I(mp->m_rootip)); if (!root) { @@ -1408,8 +1481,7 @@ xfs_fs_fill_super( } return 0; - out_syncd_stop: - xfs_syncd_stop(mp); + out_filestream_unmount: xfs_filestream_unmount(mp); out_free_sb: @@ -1429,7 +1501,6 @@ out_destroy_workqueues: out_unmount: xfs_filestream_unmount(mp); xfs_unmountfs(mp); - xfs_syncd_stop(mp); goto out_free_sb; } @@ -1625,16 +1696,6 @@ STATIC int __init xfs_init_workqueues(void) { /* - * We never want to the same work item to run twice, reclaiming inodes - * or idling the log is not going to get any faster by multiple CPUs - * competing for ressources. Use the default large max_active value - * so that even lots of filesystems can perform these task in parallel. - */ - xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0); - if (!xfs_syncd_wq) - return -ENOMEM; - - /* * The allocation workqueue can be used in memory reclaim situations * (writepage path), and parallelism is only limited by the number of * AGs in all the filesystems mounted. Hence use the default large @@ -1642,20 +1703,15 @@ xfs_init_workqueues(void) */ xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); if (!xfs_alloc_wq) - goto out_destroy_syncd; + return -ENOMEM; return 0; - -out_destroy_syncd: - destroy_workqueue(xfs_syncd_wq); - return -ENOMEM; } STATIC void xfs_destroy_workqueues(void) { destroy_workqueue(xfs_alloc_wq); - destroy_workqueue(xfs_syncd_wq); } STATIC int __init diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 9de4a920ba05..bbe3d15a7904 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -74,6 +74,7 @@ struct block_device; extern __uint64_t xfs_max_file_offset(unsigned int); +extern void xfs_flush_inodes(struct xfs_mount *mp); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *); extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index ee2d2adaa438..2801b5ce6cdb 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -202,6 +202,15 @@ static ctl_table xfs_table[] = { .extra1 = &xfs_params.fstrm_timer.min, .extra2 = &xfs_params.fstrm_timer.max, }, + { + .procname = "speculative_prealloc_lifetime", + .data = &xfs_params.eofb_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.eofb_timer.min, + .extra2 = &xfs_params.eofb_timer.max, + }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS { diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index b9937d450f8e..bd8e157c20ef 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -47,6 +47,7 @@ typedef struct xfs_param { xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ + xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ } xfs_param_t; /* diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 7d36ccf57f93..2e137d4a85ae 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -96,6 +96,8 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); +DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); +DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); DECLARE_EVENT_CLASS(xfs_perag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, @@ -130,6 +132,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks); TRACE_EVENT(xfs_attr_list_node_descend, TP_PROTO(struct xfs_attr_list_context *ctx, @@ -585,6 +589,10 @@ DEFINE_INODE_EVENT(xfs_update_time); DEFINE_INODE_EVENT(xfs_dquot_dqalloc); DEFINE_INODE_EVENT(xfs_dquot_dqdetach); +DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); + DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), TP_ARGS(ip, caller_ip), @@ -1496,8 +1504,42 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace); DEFINE_DIR2_EVENT(xfs_dir2_node_removename); DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); +DECLARE_EVENT_CLASS(xfs_attr_class, + TP_PROTO(struct xfs_da_args *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __dynamic_array(char, name, args->namelen) + __field(int, namelen) + __field(int, valuelen) + __field(xfs_dahash_t, hashval) + __field(int, op_flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + if (args->namelen) + memcpy(__get_str(name), args->name, args->namelen); + __entry->namelen = args->namelen; + __entry->valuelen = args->valuelen; + __entry->hashval = args->hashval; + __entry->op_flags = args->op_flags; + ), + TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d " + "hashval 0x%x op_flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->namelen, + __entry->namelen ? __get_str(name) : NULL, + __entry->namelen, + __entry->valuelen, + __entry->hashval, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) +) + #define DEFINE_ATTR_EVENT(name) \ -DEFINE_EVENT(xfs_da_class, name, \ +DEFINE_EVENT(xfs_attr_class, name, \ TP_PROTO(struct xfs_da_args *args), \ TP_ARGS(args)) DEFINE_ATTR_EVENT(xfs_attr_sf_add); @@ -1511,10 +1553,14 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf); DEFINE_ATTR_EVENT(xfs_attr_leaf_add); DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); +DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work); DEFINE_ATTR_EVENT(xfs_attr_leaf_addname); DEFINE_ATTR_EVENT(xfs_attr_leaf_create); +DEFINE_ATTR_EVENT(xfs_attr_leaf_compact); +DEFINE_ATTR_EVENT(xfs_attr_leaf_get); DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup); DEFINE_ATTR_EVENT(xfs_attr_leaf_replace); +DEFINE_ATTR_EVENT(xfs_attr_leaf_remove); DEFINE_ATTR_EVENT(xfs_attr_leaf_removename); DEFINE_ATTR_EVENT(xfs_attr_leaf_split); DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before); @@ -1526,12 +1572,21 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf); DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node); DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance); DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance); +DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall); DEFINE_ATTR_EVENT(xfs_attr_node_addname); +DEFINE_ATTR_EVENT(xfs_attr_node_get); DEFINE_ATTR_EVENT(xfs_attr_node_lookup); DEFINE_ATTR_EVENT(xfs_attr_node_replace); DEFINE_ATTR_EVENT(xfs_attr_node_removename); +DEFINE_ATTR_EVENT(xfs_attr_fillstate); +DEFINE_ATTR_EVENT(xfs_attr_refillstate); + +DEFINE_ATTR_EVENT(xfs_attr_rmtval_get); +DEFINE_ATTR_EVENT(xfs_attr_rmtval_set); +DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove); + #define DEFINE_DA_EVENT(name) \ DEFINE_EVENT(xfs_da_class, name, \ TP_PROTO(struct xfs_da_args *args), \ @@ -1550,9 +1605,12 @@ DEFINE_DA_EVENT(xfs_da_node_split); DEFINE_DA_EVENT(xfs_da_node_remove); DEFINE_DA_EVENT(xfs_da_node_rebalance); DEFINE_DA_EVENT(xfs_da_node_unbalance); +DEFINE_DA_EVENT(xfs_da_node_toosmall); DEFINE_DA_EVENT(xfs_da_swap_lastblock); DEFINE_DA_EVENT(xfs_da_grow_inode); DEFINE_DA_EVENT(xfs_da_shrink_inode); +DEFINE_DA_EVENT(xfs_da_fixhashpath); +DEFINE_DA_EVENT(xfs_da_path_shift); DECLARE_EVENT_CLASS(xfs_dir2_space_class, TP_PROTO(struct xfs_da_args *args, int idx), diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index db056544cbb5..c6c0601abd7a 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -464,10 +464,7 @@ xfs_trans_get_buf( int numblks, uint flags) { - struct xfs_buf_map map = { - .bm_bn = blkno, - .bm_len = numblks, - }; + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_trans_get_buf_map(tp, target, &map, 1, flags); } @@ -476,7 +473,8 @@ int xfs_trans_read_buf_map(struct xfs_mount *mp, struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - struct xfs_buf **bpp); + struct xfs_buf **bpp, + const struct xfs_buf_ops *ops); static inline int xfs_trans_read_buf( @@ -486,13 +484,12 @@ xfs_trans_read_buf( xfs_daddr_t blkno, int numblks, xfs_buf_flags_t flags, - struct xfs_buf **bpp) + struct xfs_buf **bpp, + const struct xfs_buf_ops *ops) { - struct xfs_buf_map map = { - .bm_bn = blkno, - .bm_len = numblks, - }; - return xfs_trans_read_buf_map(mp, tp, target, &map, 1, flags, bpp); + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_trans_read_buf_map(mp, tp, target, &map, 1, + flags, bpp, ops); } struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 6311b99c267f..4fc17d479d42 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -257,7 +257,8 @@ xfs_trans_read_buf_map( struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - struct xfs_buf **bpp) + struct xfs_buf **bpp, + const struct xfs_buf_ops *ops) { xfs_buf_t *bp; xfs_buf_log_item_t *bip; @@ -265,7 +266,7 @@ xfs_trans_read_buf_map( *bpp = NULL; if (!tp) { - bp = xfs_buf_read_map(target, map, nmaps, flags); + bp = xfs_buf_read_map(target, map, nmaps, flags, ops); if (!bp) return (flags & XBF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); @@ -312,7 +313,9 @@ xfs_trans_read_buf_map( if (!(XFS_BUF_ISDONE(bp))) { trace_xfs_trans_read_buf_io(bp, _RET_IP_); ASSERT(!XFS_BUF_ISASYNC(bp)); + ASSERT(bp->b_iodone == NULL); XFS_BUF_READ(bp); + bp->b_ops = ops; xfsbdstrat(tp->t_mountp, bp); error = xfs_buf_iowait(bp); if (error) { @@ -349,7 +352,7 @@ xfs_trans_read_buf_map( return 0; } - bp = xfs_buf_read_map(target, map, nmaps, flags); + bp = xfs_buf_read_map(target, map, nmaps, flags, ops); if (bp == NULL) { *bpp = NULL; return (flags & XBF_TRYLOCK) ? diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 2a5c637344b4..d95f565a390e 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -47,6 +47,7 @@ #include "xfs_filestream.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_icache.h" /* * The maximum pathlen is 1024 bytes. Since the minimum file system @@ -79,7 +80,7 @@ xfs_readlink_bmap( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL); if (!bp) return XFS_ERROR(ENOMEM); error = bp->b_error; @@ -150,7 +151,7 @@ xfs_readlink( * when the link count isn't zero and by xfs_dm_punch_hole() when * punching a hole to EOF. */ -STATIC int +int xfs_free_eofblocks( xfs_mount_t *mp, xfs_inode_t *ip, @@ -199,7 +200,7 @@ xfs_free_eofblocks( if (need_iolock) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { xfs_trans_cancel(tp, 0); - return 0; + return EAGAIN; } } @@ -237,6 +238,8 @@ xfs_free_eofblocks( } else { error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (!error) + xfs_inode_clear_eofblocks_tag(ip); } xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -425,19 +428,18 @@ xfs_release( truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); if (truncated) { xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); - if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) - xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); + if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) { + error = -filemap_flush(VFS_I(ip)->i_mapping); + if (error) + return error; + } } } if (ip->i_d.di_nlink == 0) return 0; - if ((S_ISREG(ip->i_d.di_mode) && - (VFS_I(ip)->i_size > 0 || - (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS)) && - (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { + if (xfs_can_free_eofblocks(ip, false)) { /* * If we can't get the iolock just skip truncating the blocks @@ -464,7 +466,7 @@ xfs_release( return 0; error = xfs_free_eofblocks(mp, ip, true); - if (error) + if (error && error != EAGAIN) return error; /* delalloc blocks after truncation means it really is dirty */ @@ -513,13 +515,12 @@ xfs_inactive( goto out; if (ip->i_d.di_nlink != 0) { - if ((S_ISREG(ip->i_d.di_mode) && - (VFS_I(ip)->i_size > 0 || - (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS) && - (!(ip->i_d.di_flags & - (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || - ip->i_delayed_blks != 0))) { + /* + * force is true because we are evicting an inode from the + * cache. Post-eof blocks must be freed, lest we end up with + * broken free space accounting. + */ + if (xfs_can_free_eofblocks(ip, true)) { error = xfs_free_eofblocks(mp, ip, false); if (error) return VN_INACTIVE_CACHE; @@ -777,7 +778,7 @@ xfs_create( XFS_TRANS_PERM_LOG_RES, log_count); if (error == ENOSPC) { /* flush outstanding delalloc blocks and retry */ - xfs_flush_inodes(dp); + xfs_flush_inodes(mp); error = xfs_trans_reserve(tp, resblks, log_res, 0, XFS_TRANS_PERM_LOG_RES, log_count); } @@ -1957,12 +1958,11 @@ xfs_free_file_space( rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); ioffset = offset & ~(rounding - 1); - - if (VN_CACHED(VFS_I(ip)) != 0) { - error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); - if (error) - goto out_unlock_iolock; - } + error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ioffset, -1); + if (error) + goto out_unlock_iolock; + truncate_pagecache_range(VFS_I(ip), ioffset, -1); /* * Need to zero the stuff we're not freeing, on disk. @@ -2095,6 +2095,73 @@ xfs_free_file_space( return error; } + +STATIC int +xfs_zero_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len, + int attr_flags) +{ + struct xfs_mount *mp = ip->i_mount; + uint granularity; + xfs_off_t start_boundary; + xfs_off_t end_boundary; + int error; + + granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + + /* + * Round the range of extents we are going to convert inwards. If the + * offset is aligned, then it doesn't get changed so we zero from the + * start of the block offset points to. + */ + start_boundary = round_up(offset, granularity); + end_boundary = round_down(offset + len, granularity); + + ASSERT(start_boundary >= offset); + ASSERT(end_boundary <= offset + len); + + if (!(attr_flags & XFS_ATTR_NOLOCK)) + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + if (start_boundary < end_boundary - 1) { + /* punch out the page cache over the conversion range */ + truncate_pagecache_range(VFS_I(ip), start_boundary, + end_boundary - 1); + /* convert the blocks */ + error = xfs_alloc_file_space(ip, start_boundary, + end_boundary - start_boundary - 1, + XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT, + attr_flags); + if (error) + goto out_unlock; + + /* We've handled the interior of the range, now for the edges */ + if (start_boundary != offset) + error = xfs_iozero(ip, offset, start_boundary - offset); + if (error) + goto out_unlock; + + if (end_boundary != offset + len) + error = xfs_iozero(ip, end_boundary, + offset + len - end_boundary); + + } else { + /* + * It's either a sub-granularity range or the range spanned lies + * partially across two adjacent blocks. + */ + error = xfs_iozero(ip, offset, len); + } + +out_unlock: + if (!(attr_flags & XFS_ATTR_NOLOCK)) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + +} + /* * xfs_change_file_space() * This routine allocates or frees disk space for the given file. @@ -2120,10 +2187,8 @@ xfs_change_file_space( xfs_fsize_t fsize; int setprealloc; xfs_off_t startoffset; - xfs_off_t llen; xfs_trans_t *tp; struct iattr iattr; - int prealloc_type; if (!S_ISREG(ip->i_d.di_mode)) return XFS_ERROR(EINVAL); @@ -2141,12 +2206,30 @@ xfs_change_file_space( return XFS_ERROR(EINVAL); } - llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len; + /* + * length of <= 0 for resv/unresv/zero is invalid. length for + * alloc/free is ignored completely and we have no idea what userspace + * might have set it to, so set it to zero to allow range + * checks to pass. + */ + switch (cmd) { + case XFS_IOC_ZERO_RANGE: + case XFS_IOC_RESVSP: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP: + case XFS_IOC_UNRESVSP64: + if (bf->l_len <= 0) + return XFS_ERROR(EINVAL); + break; + default: + bf->l_len = 0; + break; + } if (bf->l_start < 0 || bf->l_start > mp->m_super->s_maxbytes || - bf->l_start + llen < 0 || - bf->l_start + llen > mp->m_super->s_maxbytes) + bf->l_start + bf->l_len < 0 || + bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) return XFS_ERROR(EINVAL); bf->l_whence = 0; @@ -2154,29 +2237,20 @@ xfs_change_file_space( startoffset = bf->l_start; fsize = XFS_ISIZE(ip); - /* - * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve - * file space. - * These calls do NOT zero the data space allocated to the file, - * nor do they change the file size. - * - * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file - * space. - * These calls cause the new file data to be zeroed and the file - * size to be changed. - */ setprealloc = clrprealloc = 0; - prealloc_type = XFS_BMAPI_PREALLOC; - switch (cmd) { case XFS_IOC_ZERO_RANGE: - prealloc_type |= XFS_BMAPI_CONVERT; - xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0); - /* FALLTHRU */ + error = xfs_zero_file_space(ip, startoffset, bf->l_len, + attr_flags); + if (error) + return error; + setprealloc = 1; + break; + case XFS_IOC_RESVSP: case XFS_IOC_RESVSP64: error = xfs_alloc_file_space(ip, startoffset, bf->l_len, - prealloc_type, attr_flags); + XFS_BMAPI_PREALLOC, attr_flags); if (error) return error; setprealloc = 1; diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 447e146b2ba6..5163022d9808 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -48,14 +48,9 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); -void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first, - xfs_off_t last, int fiopt); -int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, - xfs_off_t last, int fiopt); -int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, - xfs_off_t last, uint64_t flags, int fiopt); -int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last); +int xfs_iozero(struct xfs_inode *, loff_t, size_t); int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); +int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool); #endif /* _XFS_VNODEOPS_H */ |