From 5db53f3e80dee2d9dff5e534f9e9fe1db17c9936 Mon Sep 17 00:00:00 2001 From: Joern Engel <joern@logfs.org> Date: Fri, 20 Nov 2009 20:13:39 +0100 Subject: [LogFS] add new flash file system This is a new flash file system. See Documentation/filesystems/logfs.txt Signed-off-by: Joern Engel <joern@logfs.org> --- fs/Kconfig | 1 + fs/Makefile | 1 + fs/logfs/Kconfig | 17 + fs/logfs/Makefile | 13 + fs/logfs/compr.c | 95 +++ fs/logfs/dev_bdev.c | 263 ++++++ fs/logfs/dev_mtd.c | 253 ++++++ fs/logfs/dir.c | 818 ++++++++++++++++++ fs/logfs/file.c | 263 ++++++ fs/logfs/gc.c | 730 ++++++++++++++++ fs/logfs/inode.c | 417 ++++++++++ fs/logfs/journal.c | 879 ++++++++++++++++++++ fs/logfs/logfs.h | 722 ++++++++++++++++ fs/logfs/logfs_abi.h | 627 ++++++++++++++ fs/logfs/readwrite.c | 2246 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/logfs/segment.c | 924 +++++++++++++++++++++ fs/logfs/super.c | 634 ++++++++++++++ 17 files changed, 8903 insertions(+) create mode 100644 fs/logfs/Kconfig create mode 100644 fs/logfs/Makefile create mode 100644 fs/logfs/compr.c create mode 100644 fs/logfs/dev_bdev.c create mode 100644 fs/logfs/dev_mtd.c create mode 100644 fs/logfs/dir.c create mode 100644 fs/logfs/file.c create mode 100644 fs/logfs/gc.c create mode 100644 fs/logfs/inode.c create mode 100644 fs/logfs/journal.c create mode 100644 fs/logfs/logfs.h create mode 100644 fs/logfs/logfs_abi.h create mode 100644 fs/logfs/readwrite.c create mode 100644 fs/logfs/segment.c create mode 100644 fs/logfs/super.c (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 64d44efad7a5..7405f071be67 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -177,6 +177,7 @@ source "fs/efs/Kconfig" source "fs/jffs2/Kconfig" # UBIFS File system configuration source "fs/ubifs/Kconfig" +source "fs/logfs/Kconfig" source "fs/cramfs/Kconfig" source "fs/squashfs/Kconfig" source "fs/freevxfs/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index af6d04700d9c..c3633aa46911 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -99,6 +99,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/ obj-$(CONFIG_UFS_FS) += ufs/ obj-$(CONFIG_EFS_FS) += efs/ obj-$(CONFIG_JFFS2_FS) += jffs2/ +obj-$(CONFIG_LOGFS) += logfs/ obj-$(CONFIG_UBIFS_FS) += ubifs/ obj-$(CONFIG_AFFS_FS) += affs/ obj-$(CONFIG_ROMFS_FS) += romfs/ diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig new file mode 100644 index 000000000000..daf9a9b32dd3 --- /dev/null +++ b/fs/logfs/Kconfig @@ -0,0 +1,17 @@ +config LOGFS + tristate "LogFS file system (EXPERIMENTAL)" + depends on (MTD || BLOCK) && EXPERIMENTAL + select ZLIB_INFLATE + select ZLIB_DEFLATE + select CRC32 + select BTREE + help + Flash filesystem aimed to scale efficiently to large devices. + In comparison to JFFS2 it offers significantly faster mount + times and potentially less RAM usage, although the latter has + not been measured yet. + + In its current state it is still very experimental and should + not be used for other than testing purposes. + + If unsure, say N. diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile new file mode 100644 index 000000000000..4820027787ee --- /dev/null +++ b/fs/logfs/Makefile @@ -0,0 +1,13 @@ +obj-$(CONFIG_LOGFS) += logfs.o + +logfs-y += compr.o +logfs-y += dir.o +logfs-y += file.o +logfs-y += gc.o +logfs-y += inode.o +logfs-y += journal.o +logfs-y += readwrite.o +logfs-y += segment.o +logfs-y += super.o +logfs-$(CONFIG_BLOCK) += dev_bdev.o +logfs-$(CONFIG_MTD) += dev_mtd.o diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c new file mode 100644 index 000000000000..44bbfd249abc --- /dev/null +++ b/fs/logfs/compr.c @@ -0,0 +1,95 @@ +/* + * fs/logfs/compr.c - compression routines + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + */ +#include "logfs.h" +#include <linux/vmalloc.h> +#include <linux/zlib.h> + +#define COMPR_LEVEL 3 + +static DEFINE_MUTEX(compr_mutex); +static struct z_stream_s stream; + +int logfs_compress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + mutex_lock(&compr_mutex); + err = zlib_deflateInit(&stream, COMPR_LEVEL); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_deflateEnd(&stream); + if (err != Z_OK) + goto error; + + if (stream.total_out >= stream.total_in) + goto error; + + ret = stream.total_out; +error: + mutex_unlock(&compr_mutex); + return ret; +} + +int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + mutex_lock(&compr_mutex); + err = zlib_inflateInit(&stream); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_inflateEnd(&stream); + if (err != Z_OK) + goto error; + + ret = 0; +error: + mutex_unlock(&compr_mutex); + return ret; +} + +int __init logfs_compr_init(void) +{ + size_t size = max(zlib_deflate_workspacesize(), + zlib_inflate_workspacesize()); + stream.workspace = vmalloc(size); + if (!stream.workspace) + return -ENOMEM; + return 0; +} + +void logfs_compr_exit(void) +{ + vfree(stream.workspace); +} diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c new file mode 100644 index 000000000000..58a057b6e1af --- /dev/null +++ b/fs/logfs/dev_bdev.c @@ -0,0 +1,263 @@ +/* + * fs/logfs/dev_bdev.c - Device access methods for block devices + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + */ +#include "logfs.h" +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/buffer_head.h> + +#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) + +static void request_complete(struct bio *bio, int err) +{ + complete((struct completion *)bio->bi_private); +} + +static int sync_request(struct page *page, struct block_device *bdev, int rw) +{ + struct bio bio; + struct bio_vec bio_vec; + struct completion complete; + + bio_init(&bio); + bio.bi_io_vec = &bio_vec; + bio_vec.bv_page = page; + bio_vec.bv_len = PAGE_SIZE; + bio_vec.bv_offset = 0; + bio.bi_vcnt = 1; + bio.bi_idx = 0; + bio.bi_size = PAGE_SIZE; + bio.bi_bdev = bdev; + bio.bi_sector = page->index * (PAGE_SIZE >> 9); + init_completion(&complete); + bio.bi_private = &complete; + bio.bi_end_io = request_complete; + + submit_bio(rw, &bio); + generic_unplug_device(bdev_get_queue(bdev)); + wait_for_completion(&complete); + return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO; +} + +static int bdev_readpage(void *_sb, struct page *page) +{ + struct super_block *sb = _sb; + struct block_device *bdev = logfs_super(sb)->s_bdev; + int err; + + err = sync_request(page, bdev, READ); + if (err) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + unlock_page(page); + return err; +} + +static DECLARE_WAIT_QUEUE_HEAD(wq); + +static void writeseg_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct super_block *sb = bio->bi_private; + struct logfs_super *super = logfs_super(sb); + struct page *page; + + BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ + BUG_ON(err); + BUG_ON(bio->bi_vcnt == 0); + do { + page = bvec->bv_page; + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + end_page_writeback(page); + } while (bvec >= bio->bi_io_vec); + bio_put(bio); + if (atomic_dec_and_test(&super->s_pending_writes)) + wake_up(&wq); +} + +static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct bio *bio; + struct page *page; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); + int i; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); /* FIXME: handle this */ + + for (i = 0; i < nr_pages; i++) { + if (i >= max_pages) { + /* Block layer cannot split bios :( */ + bio->bi_vcnt = i; + bio->bi_idx = 0; + bio->bi_size = i * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = writeseg_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + + ofs += i * PAGE_SIZE; + index += i; + nr_pages -= i; + i = 0; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + } + page = find_lock_page(mapping, index + i); + BUG_ON(!page); + bio->bi_io_vec[i].bv_page = page; + bio->bi_io_vec[i].bv_len = PAGE_SIZE; + bio->bi_io_vec[i].bv_offset = 0; + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + } + bio->bi_vcnt = nr_pages; + bio->bi_idx = 0; + bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = writeseg_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + return 0; +} + +static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + int head; + + BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO); + + if (len == 0) { + /* This can happen when the object fit perfectly into a + * segment, the segment gets written per sync and subsequently + * closed. + */ + return; + } + head = ofs & (PAGE_SIZE - 1); + if (head) { + ofs -= head; + len += head; + } + len = PAGE_ALIGN(len); + __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); + generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev)); +} + +static int bdev_erase(struct super_block *sb, loff_t to, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + pgoff_t index = to >> PAGE_SHIFT; + int i, nr_pages = len >> PAGE_SHIFT; + + BUG_ON(to & (PAGE_SIZE - 1)); + BUG_ON(len & (PAGE_SIZE - 1)); + + if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + for (i = 0; i < nr_pages; i++) { + page = find_get_page(mapping, index + i); + if (page) { + memset(page_address(page), 0xFF, PAGE_SIZE); + page_cache_release(page); + } + } + return 0; +} + +static void bdev_sync(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + wait_event(wq, atomic_read(&super->s_pending_writes) == 0); +} + +static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = bdev_readpage; + + *ofs = 0; + return read_cache_page(mapping, 0, filler, sb); +} + +static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = bdev_readpage; + u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000; + pgoff_t index = pos >> PAGE_SHIFT; + + *ofs = pos; + return read_cache_page(mapping, index, filler, sb); +} + +static int bdev_write_sb(struct super_block *sb, struct page *page) +{ + struct block_device *bdev = logfs_super(sb)->s_bdev; + + /* Nothing special to do for block devices. */ + return sync_request(page, bdev, WRITE); +} + +static void bdev_put_device(struct super_block *sb) +{ + close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE); +} + +static const struct logfs_device_ops bd_devops = { + .find_first_sb = bdev_find_first_sb, + .find_last_sb = bdev_find_last_sb, + .write_sb = bdev_write_sb, + .readpage = bdev_readpage, + .writeseg = bdev_writeseg, + .erase = bdev_erase, + .sync = bdev_sync, + .put_device = bdev_put_device, +}; + +int logfs_get_sb_bdev(struct file_system_type *type, int flags, + const char *devname, struct vfsmount *mnt) +{ + struct block_device *bdev; + + bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { + int mtdnr = MINOR(bdev->bd_dev); + close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); + return logfs_get_sb_mtd(type, flags, mtdnr, mnt); + } + + return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt); +} diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c new file mode 100644 index 000000000000..68e99d046c23 --- /dev/null +++ b/fs/logfs/dev_mtd.c @@ -0,0 +1,253 @@ +/* + * fs/logfs/dev_mtd.c - Device access methods for MTD + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + */ +#include "logfs.h" +#include <linux/completion.h> +#include <linux/mount.h> +#include <linux/sched.h> + +#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) + +static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + size_t retlen; + int ret; + + ret = mtd->read(mtd, ofs, len, &retlen, buf); + BUG_ON(ret == -EINVAL); + if (ret) + return ret; + + /* Not sure if we should loop instead. */ + if (retlen != len) + return -EIO; + + return 0; +} + +static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf) +{ + struct logfs_super *super = logfs_super(sb); + struct mtd_info *mtd = super->s_mtd; + size_t retlen; + loff_t page_start, page_end; + int ret; + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs)); + BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift); + BUG_ON(len > PAGE_CACHE_SIZE); + page_start = ofs & PAGE_CACHE_MASK; + page_end = PAGE_CACHE_ALIGN(ofs + len) - 1; + ret = mtd->write(mtd, ofs, len, &retlen, buf); + if (ret || (retlen != len)) + return -EIO; + + return 0; +} + +/* + * For as long as I can remember (since about 2001) mtd->erase has been an + * asynchronous interface lacking the first driver to actually use the + * asynchronous properties. So just to prevent the first implementor of such + * a thing from breaking logfs in 2350, we do the usual pointless dance to + * declare a completion variable and wait for completion before returning + * from mtd_erase(). What an excercise in futility! + */ +static void logfs_erase_callback(struct erase_info *ei) +{ + complete((struct completion *)ei->priv); +} + +static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + pgoff_t index = ofs >> PAGE_SHIFT; + + for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) { + page = find_get_page(mapping, index); + if (!page) + continue; + memset(page_address(page), 0xFF, PAGE_SIZE); + page_cache_release(page); + } + return 0; +} + +static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + struct erase_info ei; + DECLARE_COMPLETION_ONSTACK(complete); + int ret; + + BUG_ON(len % mtd->erasesize); + if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + memset(&ei, 0, sizeof(ei)); + ei.mtd = mtd; + ei.addr = ofs; + ei.len = len; + ei.callback = logfs_erase_callback; + ei.priv = (long)&complete; + ret = mtd->erase(mtd, &ei); + if (ret) + return -EIO; + + wait_for_completion(&complete); + if (ei.state != MTD_ERASE_DONE) + return -EIO; + return mtd_erase_mapping(sb, ofs, len); +} + +static void mtd_sync(struct super_block *sb) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + + if (mtd->sync) + mtd->sync(mtd); +} + +static int mtd_readpage(void *_sb, struct page *page) +{ + struct super_block *sb = _sb; + int err; + + err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); + if (err == -EUCLEAN) { + err = 0; + /* FIXME: force GC this segment */ + } + if (err) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + unlock_page(page); + return err; +} + +static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = mtd_readpage; + struct mtd_info *mtd = super->s_mtd; + + if (!mtd->block_isbad) + return NULL; + + *ofs = 0; + while (mtd->block_isbad(mtd, *ofs)) { + *ofs += mtd->erasesize; + if (*ofs >= mtd->size) + return NULL; + } + BUG_ON(*ofs & ~PAGE_MASK); + return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); +} + +static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = mtd_readpage; + struct mtd_info *mtd = super->s_mtd; + + if (!mtd->block_isbad) + return NULL; + + *ofs = mtd->size - mtd->erasesize; + while (mtd->block_isbad(mtd, *ofs)) { + *ofs -= mtd->erasesize; + if (*ofs <= 0) + return NULL; + } + *ofs = *ofs + mtd->erasesize - 0x1000; + BUG_ON(*ofs & ~PAGE_MASK); + return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); +} + +static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + int i, err; + + for (i = 0; i < nr_pages; i++) { + page = find_lock_page(mapping, index + i); + BUG_ON(!page); + + err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); + unlock_page(page); + page_cache_release(page); + if (err) + return err; + } + return 0; +} + +static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + int head; + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return; + + if (len == 0) { + /* This can happen when the object fit perfectly into a + * segment, the segment gets written per sync and subsequently + * closed. + */ + return; + } + head = ofs & (PAGE_SIZE - 1); + if (head) { + ofs -= head; + len += head; + } + len = PAGE_ALIGN(len); + __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); +} + +static void mtd_put_device(struct super_block *sb) +{ + put_mtd_device(logfs_super(sb)->s_mtd); +} + +static const struct logfs_device_ops mtd_devops = { + .find_first_sb = mtd_find_first_sb, + .find_last_sb = mtd_find_last_sb, + .readpage = mtd_readpage, + .writeseg = mtd_writeseg, + .erase = mtd_erase, + .sync = mtd_sync, + .put_device = mtd_put_device, +}; + +int logfs_get_sb_mtd(struct file_system_type *type, int flags, + int mtdnr, struct vfsmount *mnt) +{ + struct mtd_info *mtd; + const struct logfs_device_ops *devops = &mtd_devops; + + mtd = get_mtd_device(NULL, mtdnr); + return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt); +} diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c new file mode 100644 index 000000000000..89104e6f81c4 --- /dev/null +++ b/fs/logfs/dir.c @@ -0,0 +1,818 @@ +/* + * fs/logfs/dir.c - directory-related code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + */ +#include "logfs.h" + + +/* + * Atomic dir operations + * + * Directory operations are by default not atomic. Dentries and Inodes are + * created/removed/altered in seperate operations. Therefore we need to do + * a small amount of journaling. + * + * Create, link, mkdir, mknod and symlink all share the same function to do + * the work: __logfs_create. This function works in two atomic steps: + * 1. allocate inode (remember in journal) + * 2. allocate dentry (clear journal) + * + * As we can only get interrupted between the two, when the inode we just + * created is simply stored in the anchor. On next mount, if we were + * interrupted, we delete the inode. From a users point of view the + * operation never happened. + * + * Unlink and rmdir also share the same function: unlink. Again, this + * function works in two atomic steps + * 1. remove dentry (remember inode in journal) + * 2. unlink inode (clear journal) + * + * And again, on the next mount, if we were interrupted, we delete the inode. + * From a users point of view the operation succeeded. + * + * Rename is the real pain to deal with, harder than all the other methods + * combined. Depending on the circumstances we can run into three cases. + * A "target rename" where the target dentry already existed, a "local + * rename" where both parent directories are identical or a "cross-directory + * rename" in the remaining case. + * + * Local rename is atomic, as the old dentry is simply rewritten with a new + * name. + * + * Cross-directory rename works in two steps, similar to __logfs_create and + * logfs_unlink: + * 1. Write new dentry (remember old dentry in journal) + * 2. Remove old dentry (clear journal) + * + * Here we remember a dentry instead of an inode. On next mount, if we were + * interrupted, we delete the dentry. From a users point of view, the + * operation succeeded. + * + * Target rename works in three atomic steps: + * 1. Attach old inode to new dentry (remember old dentry and new inode) + * 2. Remove old dentry (still remember the new inode) + * 3. Remove victim inode + * + * Here we remember both an inode an a dentry. If we get interrupted + * between steps 1 and 2, we delete both the dentry and the inode. If + * we get interrupted between steps 2 and 3, we delete just the inode. + * In either case, the remaining objects are deleted on next mount. From + * a users point of view, the operation succeeded. + */ + +static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd, + loff_t pos) +{ + return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL); +} + +static int write_inode(struct inode *inode) +{ + return __logfs_write_inode(inode, WF_LOCK); +} + +static s64 dir_seek_data(struct inode *inode, s64 pos) +{ + s64 new_pos = logfs_seek_data(inode, pos); + + return max(pos, new_pos - 1); +} + +static int beyond_eof(struct inode *inode, loff_t bix) +{ + loff_t pos = bix << inode->i_sb->s_blocksize_bits; + return pos >= i_size_read(inode); +} + +/* + * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11, + * so short names (len <= 9) don't even occupy the complete 32bit name + * space. A prime >256 ensures short names quickly spread the 32bit + * name space. Add about 26 for the estimated amount of information + * of each character and pick a prime nearby, preferrably a bit-sparse + * one. + */ +static u32 hash_32(const char *s, int len, u32 seed) +{ + u32 hash = seed; + int i; + + for (i = 0; i < len; i++) + hash = hash * 293 + s[i]; + return hash; +} + +/* + * We have to satisfy several conflicting requirements here. Small + * directories should stay fairly compact and not require too many + * indirect blocks. The number of possible locations for a given hash + * should be small to make lookup() fast. And we should try hard not + * to overflow the 32bit name space or nfs and 32bit host systems will + * be unhappy. + * + * So we use the following scheme. First we reduce the hash to 0..15 + * and try a direct block. If that is occupied we reduce the hash to + * 16..255 and try an indirect block. Same for 2x and 3x indirect + * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff, + * but use buckets containing eight entries instead of a single one. + * + * Using 16 entries should allow for a reasonable amount of hash + * collisions, so the 32bit name space can be packed fairly tight + * before overflowing. Oh and currently we don't overflow but return + * and error. + * + * How likely are collisions? Doing the appropriate math is beyond me + * and the Bronstein textbook. But running a test program to brute + * force collisions for a couple of days showed that on average the + * first collision occurs after 598M entries, with 290M being the + * smallest result. Obviously 21 entries could already cause a + * collision if all entries are carefully chosen. + */ +static pgoff_t hash_index(u32 hash, int round) +{ + switch (round) { + case 0: + return hash % I0_BLOCKS; + case 1: + return I0_BLOCKS + hash % (I1_BLOCKS - I0_BLOCKS); + case 2: + return I1_BLOCKS + hash % (I2_BLOCKS - I1_BLOCKS); + case 3: + return I2_BLOCKS + hash % (I3_BLOCKS - I2_BLOCKS); + case 4 ... 19: + return I3_BLOCKS + 16 * (hash % (((1<<31) - I3_BLOCKS) / 16)) + + round - 4; + } + BUG(); +} + +static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry) +{ + struct qstr *name = &dentry->d_name; + struct page *page; + struct logfs_disk_dentry *dd; + u32 hash = hash_32(name->name, name->len, 0); + pgoff_t index; + int round; + + if (name->len > LOGFS_MAX_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + for (round = 0; round < 20; round++) { + index = hash_index(hash, round); + + if (beyond_eof(dir, index)) + return NULL; + if (!logfs_exist_block(dir, index)) + continue; + page = read_cache_page(dir->i_mapping, index, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return page; + dd = kmap_atomic(page, KM_USER0); + BUG_ON(dd->namelen == 0); + + if (name->len != be16_to_cpu(dd->namelen) || + memcmp(name->name, dd->name, name->len)) { + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + continue; + } + + kunmap_atomic(dd, KM_USER0); + return page; + } + return NULL; +} + +static int logfs_remove_inode(struct inode *inode) +{ + int ret; + + inode->i_nlink--; + ret = write_inode(inode); + LOGFS_BUG_ON(ret, inode->i_sb); + return ret; +} + +static void abort_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + if (logfs_inode(inode)->li_block) + logfs_inode(inode)->li_block->ta = NULL; + kfree(ta); +} + +static int logfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct logfs_super *super = logfs_super(dir->i_sb); + struct inode *inode = dentry->d_inode; + struct logfs_transaction *ta; + struct page *page; + pgoff_t index; + int ret; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = UNLINK_1; + ta->ino = inode->i_ino; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + + page = logfs_get_dd_page(dir, dentry); + if (!page) + return -ENOENT; + if (IS_ERR(page)) + return PTR_ERR(page); + index = page->index; + page_cache_release(page); + + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(dir, ta); + + ret = logfs_delete(dir, index, NULL); + if (!ret) + ret = write_inode(dir); + + if (ret) { + abort_transaction(dir, ta); + printk(KERN_ERR"LOGFS: unable to delete inode\n"); + goto out; + } + + ta->state = UNLINK_2; + logfs_add_transaction(inode, ta); + ret = logfs_remove_inode(inode); +out: + mutex_unlock(&super->s_dirop_mutex); + return ret; +} + +static inline int logfs_empty_dir(struct inode *dir) +{ + u64 data; + + data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits; + return data >= i_size_read(dir); +} + +static int logfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + if (!logfs_empty_dir(inode)) + return -ENOTEMPTY; + + return logfs_unlink(dir, dentry); +} + +/* FIXME: readdir currently has it's own dir_walk code. I don't see a good + * way to combine the two copies */ +#define IMPLICIT_NODES 2 +static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir) +{ + struct inode *dir = file->f_dentry->d_inode; + loff_t pos = file->f_pos - IMPLICIT_NODES; + struct page *page; + struct logfs_disk_dentry *dd; + int full; + + BUG_ON(pos < 0); + for (;; pos++) { + if (beyond_eof(dir, pos)) + break; + if (!logfs_exist_block(dir, pos)) { + /* deleted dentry */ + pos = dir_seek_data(dir, pos); + continue; + } + page = read_cache_page(dir->i_mapping, pos, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + dd = kmap_atomic(page, KM_USER0); + BUG_ON(dd->namelen == 0); + + full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen), + pos, be64_to_cpu(dd->ino), dd->type); + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + if (full) + break; + } + + file->f_pos = pos + IMPLICIT_NODES; + return 0; +} + +static int logfs_readdir(struct file *file, void *buf, filldir_t filldir) +{ + struct inode *inode = file->f_dentry->d_inode; + ino_t pino = parent_ino(file->f_dentry); + int err; + + if (file->f_pos < 0) + return -EINVAL; + + if (file->f_pos == 0) { + if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0) + return 0; + file->f_pos++; + } + if (file->f_pos == 1) { + if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0) + return 0; + file->f_pos++; + } + + err = __logfs_readdir(file, buf, filldir); + return err; +} + +static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name) +{ + dd->namelen = cpu_to_be16(name->len); + memcpy(dd->name, name->name, name->len); +} + +static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct page *page; + struct logfs_disk_dentry *dd; + pgoff_t index; + u64 ino = 0; + struct inode *inode; + + page = logfs_get_dd_page(dir, dentry); + if (IS_ERR(page)) + return ERR_CAST(page); + if (!page) { + d_add(dentry, NULL); + return NULL; + } + index = page->index; + dd = kmap_atomic(page, KM_USER0); + ino = be64_to_cpu(dd->ino); + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + + inode = logfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)n", + ino, dir->i_ino, index); + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + +static void grow_dir(struct inode *dir, loff_t index) +{ + index = (index + 1) << dir->i_sb->s_blocksize_bits; + if (i_size_read(dir) < index) + i_size_write(dir, index); +} + +static int logfs_write_dir(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + struct page *page; + struct logfs_disk_dentry *dd; + u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0); + pgoff_t index; + int round, err; + + for (round = 0; round < 20; round++) { + index = hash_index(hash, round); + + if (logfs_exist_block(dir, index)) + continue; + page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL); + if (!page) + return -ENOMEM; + + dd = kmap_atomic(page, KM_USER0); + memset(dd, 0, sizeof(*dd)); + dd->ino = cpu_to_be64(inode->i_ino); + dd->type = logfs_type(inode); + logfs_set_name(dd, &dentry->d_name); + kunmap_atomic(dd, KM_USER0); + + err = logfs_write_buf(dir, page, WF_LOCK); + unlock_page(page); + page_cache_release(page); + if (!err) + grow_dir(dir, index); + return err; + } + /* FIXME: Is there a better return value? In most cases neither + * the filesystem nor the directory are full. But we have had + * too many collisions for this particular hash and no fallback. + */ + return -ENOSPC; +} + +static int __logfs_create(struct inode *dir, struct dentry *dentry, + struct inode *inode, const char *dest, long destlen) +{ + struct logfs_super *super = logfs_super(dir->i_sb); + struct logfs_inode *li = logfs_inode(inode); + struct logfs_transaction *ta; + int ret; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = CREATE_1; + ta->ino = inode->i_ino; + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(inode, ta); + + if (dest) { + /* symlink */ + ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL); + if (!ret) + ret = write_inode(inode); + } else { + /* creat/mkdir/mknod */ + ret = write_inode(inode); + } + if (ret) { + abort_transaction(inode, ta); + li->li_flags |= LOGFS_IF_STILLBORN; + /* FIXME: truncate symlink */ + inode->i_nlink--; + iput(inode); + goto out; + } + + ta->state = CREATE_2; + logfs_add_transaction(dir, ta); + ret = logfs_write_dir(dir, dentry, inode); + /* sync directory */ + if (!ret) + ret = write_inode(dir); + + if (ret) { + logfs_del_transaction(dir, ta); + ta->state = CREATE_2; + logfs_add_transaction(inode, ta); + logfs_remove_inode(inode); + iput(inode); + goto out; + } + d_instantiate(dentry, inode); +out: + mutex_unlock(&super->s_dirop_mutex); + return ret; +} + +static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + + /* + * FIXME: why do we have to fill in S_IFDIR, while the mode is + * correct for mknod, creat, etc.? Smells like the vfs *should* + * do it for us but for some reason fails to do so. + */ + inode = logfs_new_inode(dir, S_IFDIR | mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_dir_iops; + inode->i_fop = &logfs_dir_fops; + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + + inode = logfs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_reg_iops; + inode->i_fop = &logfs_reg_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + struct inode *inode; + + if (dentry->d_name.len > LOGFS_MAX_NAMELEN) + return -ENAMETOOLONG; + + inode = logfs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + init_special_inode(inode, mode, rdev); + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_symlink(struct inode *dir, struct dentry *dentry, + const char *target) +{ + struct inode *inode; + size_t destlen = strlen(target) + 1; + + if (destlen > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + + inode = logfs_new_inode(dir, S_IFLNK | 0777); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_symlink_iops; + inode->i_mapping->a_ops = &logfs_reg_aops; + + return __logfs_create(dir, dentry, inode, target, destlen); +} + +static int logfs_permission(struct inode *inode, int mask) +{ + return generic_permission(inode, mask, NULL); +} + +static int logfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + if (inode->i_nlink >= LOGFS_LINK_MAX) + return -EMLINK; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + atomic_inc(&inode->i_count); + inode->i_nlink++; + mark_inode_dirty_sync(inode); + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_get_dd(struct inode *dir, struct dentry *dentry, + struct logfs_disk_dentry *dd, loff_t *pos) +{ + struct page *page; + void *map; + + page = logfs_get_dd_page(dir, dentry); + if (IS_ERR(page)) + return PTR_ERR(page); + *pos = page->index; + map = kmap_atomic(page, KM_USER0); + memcpy(dd, map, sizeof(*dd)); + kunmap_atomic(map, KM_USER0); + page_cache_release(page); + return 0; +} + +static int logfs_delete_dd(struct inode *dir, loff_t pos) +{ + /* + * Getting called with pos somewhere beyond eof is either a goofup + * within this file or means someone maliciously edited the + * (crc-protected) journal. + */ + BUG_ON(beyond_eof(dir, pos)); + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos); + return logfs_delete(dir, pos, NULL); +} + +/* + * Cross-directory rename, target does not exist. Just a little nasty. + * Create a new dentry in the target dir, then remove the old dentry, + * all the while taking care to remember our operation in the journal. + */ +static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct logfs_super *super = logfs_super(old_dir->i_sb); + struct logfs_disk_dentry dd; + struct logfs_transaction *ta; + loff_t pos; + int err; + + /* 1. locate source dd */ + err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); + if (err) + return err; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = CROSS_RENAME_1; + ta->dir = old_dir->i_ino; + ta->pos = pos; + + /* 2. write target dd */ + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(new_dir, ta); + err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode); + if (!err) + err = write_inode(new_dir); + + if (err) { + super->s_rename_dir = 0; + super->s_rename_pos = 0; + abort_transaction(new_dir, ta); + goto out; + } + + /* 3. remove source dd */ + ta->state = CROSS_RENAME_2; + logfs_add_transaction(old_dir, ta); + err = logfs_delete_dd(old_dir, pos); + if (!err) + err = write_inode(old_dir); + LOGFS_BUG_ON(err, old_dir->i_sb); +out: + mutex_unlock(&super->s_dirop_mutex); + return err; +} + +static int logfs_replace_inode(struct inode *dir, struct dentry *dentry, + struct logfs_disk_dentry *dd, struct inode *inode) +{ + loff_t pos; + int err; + + err = logfs_get_dd(dir, dentry, dd, &pos); + if (err) + return err; + dd->ino = cpu_to_be64(inode->i_ino); + dd->type = logfs_type(inode); + + err = write_dir(dir, dd, pos); + if (err) + return err; + log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos, + dd->name, be64_to_cpu(dd->ino)); + return write_inode(dir); +} + +/* Target dentry exists - the worst case. We need to attach the source + * inode to the target dentry, then remove the orphaned target inode and + * source dentry. + */ +static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct logfs_super *super = logfs_super(old_dir->i_sb); + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + int isdir = S_ISDIR(old_inode->i_mode); + struct logfs_disk_dentry dd; + struct logfs_transaction *ta; + loff_t pos; + int err; + + BUG_ON(isdir != S_ISDIR(new_inode->i_mode)); + if (isdir) { + if (!logfs_empty_dir(new_inode)) + return -ENOTEMPTY; + } + + /* 1. locate source dd */ + err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); + if (err) + return err; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = TARGET_RENAME_1; + ta->dir = old_dir->i_ino; + ta->pos = pos; + ta->ino = new_inode->i_ino; + + /* 2. attach source inode to target dd */ + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(new_dir, ta); + err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode); + if (err) { + super->s_rename_dir = 0; + super->s_rename_pos = 0; + super->s_victim_ino = 0; + abort_transaction(new_dir, ta); + goto out; + } + + /* 3. remove source dd */ + ta->state = TARGET_RENAME_2; + logfs_add_transaction(old_dir, ta); + err = logfs_delete_dd(old_dir, pos); + if (!err) + err = write_inode(old_dir); + LOGFS_BUG_ON(err, old_dir->i_sb); + + /* 4. remove target inode */ + ta->state = TARGET_RENAME_3; + logfs_add_transaction(new_inode, ta); + err = logfs_remove_inode(new_inode); + +out: + mutex_unlock(&super->s_dirop_mutex); + return err; +} + +static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (new_dentry->d_inode) + return logfs_rename_target(old_dir, old_dentry, + new_dir, new_dentry); + return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry); +} + +/* No locking done here, as this is called before .get_sb() returns. */ +int logfs_replay_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + u64 ino, pos; + int err; + + if (super->s_victim_ino) { + /* delete victim inode */ + ino = super->s_victim_ino; + printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino); + inode = logfs_iget(sb, ino); + if (IS_ERR(inode)) + goto fail; + + LOGFS_BUG_ON(i_size_read(inode) > 0, sb); + super->s_victim_ino = 0; + err = logfs_remove_inode(inode); + iput(inode); + if (err) { + super->s_victim_ino = ino; + goto fail; + } + } + if (super->s_rename_dir) { + /* delete old dd from rename */ + ino = super->s_rename_dir; + pos = super->s_rename_pos; + printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n", + ino, pos); + inode = logfs_iget(sb, ino); + if (IS_ERR(inode)) + goto fail; + + super->s_rename_dir = 0; + super->s_rename_pos = 0; + err = logfs_delete_dd(inode, pos); + iput(inode); + if (err) { + super->s_rename_dir = ino; + super->s_rename_pos = pos; + goto fail; + } + } + return 0; +fail: + LOGFS_BUG(sb); + return -EIO; +} + +const struct inode_operations logfs_symlink_iops = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, +}; + +const struct inode_operations logfs_dir_iops = { + .create = logfs_create, + .link = logfs_link, + .lookup = logfs_lookup, + .mkdir = logfs_mkdir, + .mknod = logfs_mknod, + .rename = logfs_rename, + .rmdir = logfs_rmdir, + .permission = logfs_permission, + .symlink = logfs_symlink, + .unlink = logfs_unlink, +}; +const struct file_operations logfs_dir_fops = { + .fsync = logfs_fsync, + .ioctl = logfs_ioctl, + .readdir = logfs_readdir, + .read = generic_read_dir, +}; diff --git a/fs/logfs/file.c b/fs/logfs/file.c new file mode 100644 index 000000000000..370f367a933e --- /dev/null +++ b/fs/logfs/file.c @@ -0,0 +1,263 @@ +/* + * fs/logfs/file.c - prepare_write, commit_write and friends + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + */ +#include "logfs.h" +#include <linux/sched.h> +#include <linux/writeback.h> + +static int logfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) + return 0; + if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + len; + + /* Reading beyond i_size is simple: memset to zero */ + zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); + return 0; + } + return logfs_readpage_nolock(page); +} + +static int logfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, struct page *page, + void *fsdata) +{ + struct inode *inode = mapping->host; + pgoff_t index = page->index; + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + copied; + int ret = 0; + + BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize); + BUG_ON(page->index > I3_BLOCKS); + + if (copied < len) { + /* + * Short write of a non-initialized paged. Just tell userspace + * to retry the entire page. + */ + if (!PageUptodate(page)) { + copied = 0; + goto out; + } + } + if (copied == 0) + goto out; /* FIXME: do we need to update inode? */ + + if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) { + i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end); + mark_inode_dirty_sync(inode); + } + + SetPageUptodate(page); + if (!PageDirty(page)) { + if (!get_page_reserve(inode, page)) + __set_page_dirty_nobuffers(page); + else + ret = logfs_write_buf(inode, page, WF_LOCK); + } +out: + unlock_page(page); + page_cache_release(page); + return ret ? ret : copied; +} + +int logfs_readpage(struct file *file, struct page *page) +{ + int ret; + + ret = logfs_readpage_nolock(page); + unlock_page(page); + return ret; +} + +/* Clear the page's dirty flag in the radix tree. */ +/* TODO: mucking with PageWriteback is silly. Add a generic function to clear + * the dirty bit from the radix tree for filesystems that don't have to wait + * for page writeback to finish (i.e. any compressing filesystem). + */ +static void clear_radix_tree_dirty(struct page *page) +{ + BUG_ON(PagePrivate(page) || page->private); + set_page_writeback(page); + end_page_writeback(page); +} + +static int __logfs_writepage(struct page *page) +{ + struct inode *inode = page->mapping->host; + int err; + + err = logfs_write_buf(inode, page, WF_LOCK); + if (err) + set_page_dirty(page); + else + clear_radix_tree_dirty(page); + unlock_page(page); + return err; +} + +static int logfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + u64 bix; + level_t level; + + log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index, + page); + + logfs_unpack_index(page->index, &bix, &level); + + /* Indirect blocks are never truncated */ + if (level != 0) + return __logfs_writepage(page); + + /* + * TODO: everything below is a near-verbatim copy of nobh_writepage(). + * The relevant bits should be factored out after logfs is merged. + */ + + /* Is the page fully inside i_size? */ + if (bix < end_index) + return __logfs_writepage(page); + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (bix > end_index || offset == 0) { + unlock_page(page); + return 0; /* don't care */ + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invokation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + return __logfs_writepage(page); +} + +static void logfs_invalidatepage(struct page *page, unsigned long offset) +{ + move_page_to_btree(page); + BUG_ON(PagePrivate(page) || page->private); +} + +static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this) +{ + return 0; /* None of these are easy to release */ +} + + +int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct logfs_inode *li = logfs_inode(inode); + unsigned int oldflags, flags; + int err; + + switch (cmd) { + case FS_IOC_GETFLAGS: + flags = li->li_flags & LOGFS_FL_USER_VISIBLE; + return put_user(flags, (int __user *)arg); + case FS_IOC_SETFLAGS: + if (IS_RDONLY(inode)) + return -EROFS; + + if (!is_owner_or_cap(inode)) + return -EACCES; + + err = get_user(flags, (int __user *)arg); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + oldflags = li->li_flags; + flags &= LOGFS_FL_USER_MODIFIABLE; + flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE; + li->li_flags = flags; + mutex_unlock(&inode->i_mutex); + + inode->i_ctime = CURRENT_TIME; + mark_inode_dirty_sync(inode); + return 0; + + default: + return -ENOTTY; + } +} + +int logfs_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + struct super_block *sb = dentry->d_inode->i_sb; + struct logfs_super *super = logfs_super(sb); + + /* FIXME: write anchor */ + super->s_devops->sync(sb); + return 0; +} + +static int logfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int err = 0; + + if (attr->ia_valid & ATTR_SIZE) + err = logfs_truncate(inode, attr->ia_size); + attr->ia_valid &= ~ATTR_SIZE; + + if (!err) + err = inode_change_ok(inode, attr); + if (!err) + err = inode_setattr(inode, attr); + return err; +} + +const struct inode_operations logfs_reg_iops = { + .setattr = logfs_setattr, +}; + +const struct file_operations logfs_reg_fops = { + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .fsync = logfs_fsync, + .ioctl = logfs_ioctl, + .llseek = generic_file_llseek, + .mmap = generic_file_readonly_mmap, + .open = generic_file_open, + .read = do_sync_read, + .write = do_sync_write, +}; + +const struct address_space_operations logfs_reg_aops = { + .invalidatepage = logfs_invalidatepage, + .readpage = logfs_readpage, + .releasepage = logfs_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, + .writepage = logfs_writepage, + .writepages = generic_writepages, + .write_begin = logfs_write_begin, + .write_end = logfs_write_end, +}; diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c new file mode 100644 index 000000000000..b3656c44190e --- /dev/null +++ b/fs/logfs/gc.c @@ -0,0 +1,730 @@ +/* + * fs/logfs/gc.c - garbage collection code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + */ +#include "logfs.h" +#include <linux/sched.h> + +/* + * Wear leveling needs to kick in when the difference between low erase + * counts and high erase counts gets too big. A good value for "too big" + * may be somewhat below 10% of maximum erase count for the device. + * Why not 397, to pick a nice round number with no specific meaning? :) + * + * WL_RATELIMIT is the minimum time between two wear level events. A huge + * number of segments may fulfil the requirements for wear leveling at the + * same time. If that happens we don't want to cause a latency from hell, + * but just gently pick one segment every so often and minimize overhead. + */ +#define WL_DELTA 397 +#define WL_RATELIMIT 100 +#define MAX_OBJ_ALIASES 2600 +#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */ +#define LIST_SIZE 64 /* base size of candidate lists */ +#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */ +#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */ + +static int no_free_segments(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + return super->s_free_list.count; +} + +/* journal has distance -1, top-most ifile layer distance 0 */ +static u8 root_distance(struct super_block *sb, gc_level_t __gc_level) +{ + struct logfs_super *super = logfs_super(sb); + u8 gc_level = (__force u8)__gc_level; + + switch (gc_level) { + case 0: /* fall through */ + case 1: /* fall through */ + case 2: /* fall through */ + case 3: + /* file data or indirect blocks */ + return super->s_ifile_levels + super->s_iblock_levels - gc_level; + case 6: /* fall through */ + case 7: /* fall through */ + case 8: /* fall through */ + case 9: + /* inode file data or indirect blocks */ + return super->s_ifile_levels - (gc_level - 6); + default: + printk(KERN_ERR"LOGFS: segment of unknown level %x found\n", + gc_level); + WARN_ON(1); + return super->s_ifile_levels + super->s_iblock_levels; + } +} + +static int segment_is_reserved(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area; + void *reserved; + int i; + + /* Some segments are reserved. Just pretend they were all valid */ + reserved = btree_lookup32(&super->s_reserved_segments, segno); + if (reserved) + return 1; + + /* Currently open segments */ + for_each_area(i) { + area = super->s_area[i]; + if (area->a_is_open && area->a_segno == segno) + return 1; + } + + return 0; +} + +static void logfs_mark_segment_bad(struct super_block *sb, u32 segno) +{ + BUG(); +} + +/* + * Returns the bytes consumed by valid objects in this segment. Object headers + * are counted, the segment header is not. + */ +static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec, + gc_level_t *gc_level) +{ + struct logfs_segment_entry se; + u32 ec_level; + + logfs_get_segment_entry(sb, segno, &se); + if (se.ec_level == cpu_to_be32(BADSEG) || + se.valid == cpu_to_be32(RESERVED)) + return RESERVED; + + ec_level = be32_to_cpu(se.ec_level); + *ec = ec_level >> 4; + *gc_level = GC_LEVEL(ec_level & 0xf); + return be32_to_cpu(se.valid); +} + +static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino, + u64 bix, gc_level_t gc_level) +{ + struct inode *inode; + int err, cookie; + + inode = logfs_safe_iget(sb, ino, &cookie); + err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0); + BUG_ON(err); + logfs_safe_iput(inode, cookie); +} + +static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_header sh; + struct logfs_object_header oh; + u64 ofs, ino, bix; + u32 seg_ofs, logical_segno, cleaned = 0; + int err, len, valid; + gc_level_t gc_level; + + LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb); + + btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS); + err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); + BUG_ON(err); + gc_level = GC_LEVEL(sh.level); + logical_segno = be32_to_cpu(sh.segno); + if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) { + logfs_mark_segment_bad(sb, segno); + cleaned = -1; + goto out; + } + + for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE; + seg_ofs + sizeof(oh) < super->s_segsize; ) { + ofs = dev_ofs(sb, logical_segno, seg_ofs); + err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh), + &oh); + BUG_ON(err); + + if (!memchr_inv(&oh, 0xff, sizeof(oh))) + break; + + if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) { + logfs_mark_segment_bad(sb, segno); + cleaned = super->s_segsize - 1; + goto out; + } + + ino = be64_to_cpu(oh.ino); + bix = be64_to_cpu(oh.bix); + len = sizeof(oh) + be16_to_cpu(oh.len); + valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level); + if (valid == 1) { + logfs_cleanse_block(sb, ofs, ino, bix, gc_level); + cleaned += len; + } else if (valid == 2) { + /* Will be invalid upon journal commit */ + cleaned += len; + } + seg_ofs += len; + } +out: + btree_remove32(&super->s_reserved_segments, segno); + return cleaned; +} + +static struct gc_candidate *add_list(struct gc_candidate *cand, + struct candidate_list *list) +{ + struct rb_node **p = &list->rb_tree.rb_node; + struct rb_node *parent = NULL; + struct gc_candidate *cur; + int comp; + + cand->list = list; + while (*p) { + parent = *p; + cur = rb_entry(parent, struct gc_candidate, rb_node); + + if (list->sort_by_ec) + comp = cand->erase_count < cur->erase_count; + else + comp = cand->valid < cur->valid; + + if (comp) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&cand->rb_node, parent, p); + rb_insert_color(&cand->rb_node, &list->rb_tree); + + if (list->count <= list->maxcount) { + list->count++; + return NULL; + } + cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node); + rb_erase(&cand->rb_node, &list->rb_tree); + cand->list = NULL; + return cand; +} + +static void remove_from_list(struct gc_candidate *cand) +{ + struct candidate_list *list = cand->list; + + rb_erase(&cand->rb_node, &list->rb_tree); + list->count--; +} + +static void free_candidate(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + + btree_remove32(&super->s_cand_tree, cand->segno); + kfree(cand); +} + +u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec) +{ + struct gc_candidate *cand; + u32 segno; + + BUG_ON(list->count == 0); + + cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); + remove_from_list(cand); + segno = cand->segno; + if (ec) + *ec = cand->erase_count; + free_candidate(sb, cand); + return segno; +} + +/* + * We have several lists to manage segments with. The reserve_list is used to + * deal with bad blocks. We try to keep the best (lowest ec) segments on this + * list. + * The free_list contains free segments for normal usage. It usually gets the + * second pick after the reserve_list. But when the free_list is running short + * it is more important to keep the free_list full than to keep a reserve. + * + * Segments that are not free are put onto a per-level low_list. If we have + * to run garbage collection, we pick a candidate from there. All segments on + * those lists should have at least some free space so GC will make progress. + * + * And last we have the ec_list, which is used to pick segments for wear + * leveling. + * + * If all appropriate lists are full, we simply free the candidate and forget + * about that segment for a while. We have better candidates for each purpose. + */ +static void __add_candidate(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE; + + if (cand->valid == 0) { + /* 100% free segments */ + log_gc_noisy("add reserve segment %x (ec %x) at %llx\n", + cand->segno, cand->erase_count, + dev_ofs(sb, cand->segno, 0)); + cand = add_list(cand, &super->s_reserve_list); + if (cand) { + log_gc_noisy("add free segment %x (ec %x) at %llx\n", + cand->segno, cand->erase_count, + dev_ofs(sb, cand->segno, 0)); + cand = add_list(cand, &super->s_free_list); + } + } else { + /* good candidates for Garbage Collection */ + if (cand->valid < full) + cand = add_list(cand, &super->s_low_list[cand->dist]); + /* good candidates for wear leveling, + * segments that were recently written get ignored */ + if (cand) + cand = add_list(cand, &super->s_ec_list); + } + if (cand) + free_candidate(sb, cand); +} + +static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec, + u8 dist) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + + cand = kmalloc(sizeof(*cand), GFP_NOFS); + if (!cand) + return -ENOMEM; + + cand->segno = segno; + cand->valid = valid; + cand->erase_count = ec; + cand->dist = dist; + + btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS); + __add_candidate(sb, cand); + return 0; +} + +static void remove_segment_from_lists(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + + cand = btree_lookup32(&super->s_cand_tree, segno); + if (cand) { + remove_from_list(cand); + free_candidate(sb, cand); + } +} + +static void scan_segment(struct super_block *sb, u32 segno) +{ + u32 valid, ec = 0; + gc_level_t gc_level = 0; + u8 dist; + + if (segment_is_reserved(sb, segno)) + return; + + remove_segment_from_lists(sb, segno); + valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); + if (valid == RESERVED) + return; + + dist = root_distance(sb, gc_level); + add_candidate(sb, segno, valid, ec, dist); +} + +static struct gc_candidate *first_in_list(struct candidate_list *list) +{ + if (list->count == 0) + return NULL; + return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); +} + +/* + * Find the best segment for garbage collection. Main criterion is + * the segment requiring the least effort to clean. Secondary + * criterion is to GC on the lowest level available. + * + * So we search the least effort segment on the lowest level first, + * then move up and pick another segment iff is requires significantly + * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison. + */ +static struct gc_candidate *get_candidate(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i, max_dist; + struct gc_candidate *cand = NULL, *this; + + max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS); + + for (i = max_dist; i >= 0; i--) { + this = first_in_list(&super->s_low_list[i]); + if (!this) + continue; + if (!cand) + cand = this; + if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid) + cand = this; + } + return cand; +} + +static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + gc_level_t gc_level; + u32 cleaned, valid, segno, ec; + u8 dist; + + if (!cand) { + log_gc("GC attempted, but no candidate found\n"); + return 0; + } + + segno = cand->segno; + dist = cand->dist; + valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); + free_candidate(sb, cand); + log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n", + segno, (u64)segno << super->s_segshift, + dist, no_free_segments(sb), valid, + super->s_free_bytes); + cleaned = logfs_gc_segment(sb, segno, dist); + log_gc("GC segment #%02x complete - now %x valid\n", segno, + valid - cleaned); + BUG_ON(cleaned != valid); + return 1; +} + +static int logfs_gc_once(struct super_block *sb) +{ + struct gc_candidate *cand; + + cand = get_candidate(sb); + if (cand) + remove_from_list(cand); + return __logfs_gc_once(sb, cand); +} + +/* returns 1 if a wrap occurs, 0 otherwise */ +static int logfs_scan_some(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u32 segno; + int i, ret = 0; + + segno = super->s_sweeper; + for (i = SCAN_RATIO; i > 0; i--) { + segno++; + if (segno >= super->s_no_segs) { + segno = 0; + ret = 1; + /* Break out of the loop. We want to read a single + * block from the segment size on next invocation if + * SCAN_RATIO is set to match block size + */ + break; + } + + scan_segment(sb, segno); + } + super->s_sweeper = segno; + return ret; +} + +/* + * In principle, this function should loop forever, looking for GC candidates + * and moving data. LogFS is designed in such a way that this loop is + * guaranteed to terminate. + * + * Limiting the loop to some iterations serves purely to catch cases when + * these guarantees have failed. An actual endless loop is an obvious bug + * and should be reported as such. + */ +static void __logfs_gc_pass(struct super_block *sb, int target) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + int round, progress, last_progress = 0; + + if (no_free_segments(sb) >= target && + super->s_no_object_aliases < MAX_OBJ_ALIASES) + return; + + log_gc("__logfs_gc_pass(%x)\n", target); + for (round = 0; round < SCAN_ROUNDS; ) { + if (no_free_segments(sb) >= target) + goto write_alias; + + /* Sync in-memory state with on-medium state in case they + * diverged */ + logfs_write_anchor(super->s_master_inode); + round += logfs_scan_some(sb); + if (no_free_segments(sb) >= target) + goto write_alias; + progress = logfs_gc_once(sb); + if (progress) + last_progress = round; + else if (round - last_progress > 2) + break; + continue; + + /* + * The goto logic is nasty, I just don't know a better way to + * code it. GC is supposed to ensure two things: + * 1. Enough free segments are available. + * 2. The number of aliases is bounded. + * When 1. is achieved, we take a look at 2. and write back + * some alias-containing blocks, if necessary. However, after + * each such write we need to go back to 1., as writes can + * consume free segments. + */ +write_alias: + if (super->s_no_object_aliases < MAX_OBJ_ALIASES) + return; + if (list_empty(&super->s_object_alias)) { + /* All aliases are still in btree */ + return; + } + log_gc("Write back one alias\n"); + block = list_entry(super->s_object_alias.next, + struct logfs_block, alias_list); + block->ops->write_block(block); + /* + * To round off the nasty goto logic, we reset round here. It + * is a safety-net for GC not making any progress and limited + * to something reasonably small. If incremented it for every + * single alias, the loop could terminate rather quickly. + */ + round = 0; + } + LOGFS_BUG(sb); +} + +static int wl_ratelimit(struct super_block *sb, u64 *next_event) +{ + struct logfs_super *super = logfs_super(sb); + + if (*next_event < super->s_gec) { + *next_event = super->s_gec + WL_RATELIMIT; + return 0; + } + return 1; +} + +static void logfs_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *wl_cand, *free_cand; + + if (wl_ratelimit(sb, &super->s_wl_gec_ostore)) + return; + + wl_cand = first_in_list(&super->s_ec_list); + if (!wl_cand) + return; + free_cand = first_in_list(&super->s_free_list); + if (!free_cand) + return; + + if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) { + remove_from_list(wl_cand); + __logfs_gc_once(sb, wl_cand); + } +} + +/* + * The journal needs wear leveling as well. But moving the journal is an + * expensive operation so we try to avoid it as much as possible. And if we + * have to do it, we move the whole journal, not individual segments. + * + * Ratelimiting is not strictly necessary here, it mainly serves to avoid the + * calculations. First we check whether moving the journal would be a + * significant improvement. That means that a) the current journal segments + * have more wear than the future journal segments and b) the current journal + * segments have more wear than normal ostore segments. + * Rationale for b) is that we don't have to move the journal if it is aging + * less than the ostore, even if the reserve segments age even less (they are + * excluded from wear leveling, after all). + * Next we check that the superblocks have less wear than the journal. Since + * moving the journal requires writing the superblocks, we have to protect the + * superblocks even more than the journal. + * + * Also we double the acceptable wear difference, compared to ostore wear + * leveling. Journal data is read and rewritten rapidly, comparatively. So + * soft errors have much less time to accumulate and we allow the journal to + * be a bit worse than the ostore. + */ +static void logfs_journal_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + u32 min_journal_ec = -1, max_reserve_ec = 0; + int i; + + if (wl_ratelimit(sb, &super->s_wl_gec_journal)) + return; + + if (super->s_reserve_list.count < super->s_no_journal_segs) { + /* Reserve is not full enough to move complete journal */ + return; + } + + journal_for_each(i) + if (super->s_journal_seg[i]) + min_journal_ec = min(min_journal_ec, + super->s_journal_ec[i]); + cand = rb_entry(rb_first(&super->s_free_list.rb_tree), + struct gc_candidate, rb_node); + max_reserve_ec = cand->erase_count; + for (i = 0; i < 2; i++) { + struct logfs_segment_entry se; + u32 segno = seg_no(sb, super->s_sb_ofs[i]); + u32 ec; + + logfs_get_segment_entry(sb, segno, &se); + ec = be32_to_cpu(se.ec_level) >> 4; + max_reserve_ec = max(max_reserve_ec, ec); + } + + if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) { + do_logfs_journal_wl_pass(sb); + } +} + +void logfs_gc_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex)); + /* Write journal before free space is getting saturated with dirty + * objects. + */ + if (super->s_dirty_used_bytes + super->s_dirty_free_bytes + + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes) + logfs_write_anchor(super->s_master_inode); + __logfs_gc_pass(sb, logfs_super(sb)->s_total_levels); + logfs_wl_pass(sb); + logfs_journal_wl_pass(sb); +} + +static int check_area(struct super_block *sb, int i) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[i]; + struct logfs_object_header oh; + u32 segno = area->a_segno; + u32 ofs = area->a_used_bytes; + __be32 crc; + int err; + + if (!area->a_is_open) + return 0; + + for (ofs = area->a_used_bytes; + ofs <= super->s_segsize - sizeof(oh); + ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) { + err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh); + if (err) + return err; + + if (!memchr_inv(&oh, 0xff, sizeof(oh))) + break; + + crc = logfs_crc32(&oh, sizeof(oh) - 4, 4); + if (crc != oh.crc) { + printk(KERN_INFO "interrupted header at %llx\n", + dev_ofs(sb, segno, ofs)); + return 0; + } + } + if (ofs != area->a_used_bytes) { + printk(KERN_INFO "%x bytes unaccounted data found at %llx\n", + ofs - area->a_used_bytes, + dev_ofs(sb, segno, area->a_used_bytes)); + area->a_used_bytes = ofs; + } + return 0; +} + +int logfs_check_areas(struct super_block *sb) +{ + int i, err; + + for_each_area(i) { + err = check_area(sb, i); + if (err) + return err; + } + return 0; +} + +static void logfs_init_candlist(struct candidate_list *list, int maxcount, + int sort_by_ec) +{ + list->count = 0; + list->maxcount = maxcount; + list->sort_by_ec = sort_by_ec; + list->rb_tree = RB_ROOT; +} + +int logfs_init_gc(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool); + logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1); + logfs_init_candlist(&super->s_reserve_list, + super->s_bad_seg_reserve, 1); + for_each_area(i) + logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0); + logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1); + return 0; +} + +static void logfs_cleanup_list(struct super_block *sb, + struct candidate_list *list) +{ + struct gc_candidate *cand; + + while (list->count) { + cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate, + rb_node); + remove_from_list(cand); + free_candidate(sb, cand); + } + BUG_ON(list->rb_tree.rb_node); +} + +void logfs_cleanup_gc(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + if (!super->s_free_list.count) + return; + + /* + * FIXME: The btree may still contain a single empty node. So we + * call the grim visitor to clean up that mess. Btree code should + * do it for us, really. + */ + btree_grim_visitor32(&super->s_cand_tree, 0, NULL); + logfs_cleanup_list(sb, &super->s_free_list); + logfs_cleanup_list(sb, &super->s_reserve_list); + for_each_area(i) + logfs_cleanup_list(sb, &super->s_low_list[i]); + logfs_cleanup_list(sb, &super->s_ec_list); +} diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c new file mode 100644 index 000000000000..6d08b3762641 --- /dev/null +++ b/fs/logfs/inode.c @@ -0,0 +1,417 @@ +/* + * fs/logfs/inode.c - inode handling code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + */ +#include "logfs.h" +#include <linux/writeback.h> +#include <linux/backing-dev.h> + +/* + * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes + * on the medium. It therefore also lacks a method to store the previous + * generation number for deleted inodes. Instead a single generation number + * is stored which will be used for new inodes. Being just a 32bit counter, + * this can obvious wrap relatively quickly. So we only reuse inodes if we + * know that a fair number of inodes can be created before we have to increment + * the generation again - effectively adding some bits to the counter. + * But being too aggressive here means we keep a very large and very sparse + * inode file, wasting space on indirect blocks. + * So what is a good value? Beats me. 64k seems moderately bad on both + * fronts, so let's use that for now... + * + * NFS sucks, as everyone already knows. + */ +#define INOS_PER_WRAP (0x10000) + +/* + * Logfs' requirement to read inodes for garbage collection makes life a bit + * harder. GC may have to read inodes that are in I_FREEING state, when they + * are being written out - and waiting for GC to make progress, naturally. + * + * So we cannot just call iget() or some variant of it, but first have to check + * wether the inode in question might be in I_FREEING state. Therefore we + * maintain our own per-sb list of "almost deleted" inodes and check against + * that list first. Normally this should be at most 1-2 entries long. + * + * Also, inodes have logfs-specific reference counting on top of what the vfs + * does. When .destroy_inode is called, normally the reference count will drop + * to zero and the inode gets deleted. But if GC accessed the inode, its + * refcount will remain nonzero and final deletion will have to wait. + * + * As a result we have two sets of functions to get/put inodes: + * logfs_safe_iget/logfs_safe_iput - safe to call from GC context + * logfs_iget/iput - normal version + */ +static struct kmem_cache *logfs_inode_cache; + +static DEFINE_SPINLOCK(logfs_inode_lock); + +static void logfs_inode_setops(struct inode *inode) +{ + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + inode->i_op = &logfs_dir_iops; + inode->i_fop = &logfs_dir_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFREG: + inode->i_op = &logfs_reg_iops; + inode->i_fop = &logfs_reg_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFLNK: + inode->i_op = &logfs_symlink_iops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + default: + BUG(); + } +} + +static struct inode *__logfs_iget(struct super_block *sb, ino_t ino) +{ + struct inode *inode = iget_locked(sb, ino); + int err; + + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = logfs_read_inode(inode); + if (err || inode->i_nlink == 0) { + /* inode->i_nlink == 0 can be true when called from + * block validator */ + /* set i_nlink to 0 to prevent caching */ + inode->i_nlink = 0; + logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE; + iget_failed(inode); + if (!err) + err = -ENOENT; + return ERR_PTR(err); + } + + logfs_inode_setops(inode); + unlock_new_inode(inode); + return inode; +} + +struct inode *logfs_iget(struct super_block *sb, ino_t ino) +{ + BUG_ON(ino == LOGFS_INO_MASTER); + BUG_ON(ino == LOGFS_INO_SEGFILE); + return __logfs_iget(sb, ino); +} + +/* + * is_cached is set to 1 if we hand out a cached inode, 0 otherwise. + * this allows logfs_iput to do the right thing later + */ +struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_inode *li; + + if (ino == LOGFS_INO_MASTER) + return super->s_master_inode; + if (ino == LOGFS_INO_SEGFILE) + return super->s_segfile_inode; + + spin_lock(&logfs_inode_lock); + list_for_each_entry(li, &super->s_freeing_list, li_freeing_list) + if (li->vfs_inode.i_ino == ino) { + li->li_refcount++; + spin_unlock(&logfs_inode_lock); + *is_cached = 1; + return &li->vfs_inode; + } + spin_unlock(&logfs_inode_lock); + + *is_cached = 0; + return __logfs_iget(sb, ino); +} + +static void __logfs_destroy_inode(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + + BUG_ON(li->li_block); + list_del(&li->li_freeing_list); + kmem_cache_free(logfs_inode_cache, li); +} + +static void logfs_destroy_inode(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + + BUG_ON(list_empty(&li->li_freeing_list)); + spin_lock(&logfs_inode_lock); + li->li_refcount--; + if (li->li_refcount == 0) + __logfs_destroy_inode(inode); + spin_unlock(&logfs_inode_lock); +} + +void logfs_safe_iput(struct inode *inode, int is_cached) +{ + if (inode->i_ino == LOGFS_INO_MASTER) + return; + if (inode->i_ino == LOGFS_INO_SEGFILE) + return; + + if (is_cached) { + logfs_destroy_inode(inode); + return; + } + + iput(inode); +} + +static void logfs_init_inode(struct super_block *sb, struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + li->li_flags = 0; + li->li_height = 0; + li->li_used_bytes = 0; + li->li_block = NULL; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_ctime = CURRENT_TIME; + inode->i_mtime = CURRENT_TIME; + inode->i_nlink = 1; + INIT_LIST_HEAD(&li->li_freeing_list); + + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = 0; + + return; +} + +static struct inode *logfs_alloc_inode(struct super_block *sb) +{ + struct logfs_inode *li; + + li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS); + if (!li) + return NULL; + logfs_init_inode(sb, &li->vfs_inode); + return &li->vfs_inode; +} + +/* + * In logfs inodes are written to an inode file. The inode file, like any + * other file, is managed with a inode. The inode file's inode, aka master + * inode, requires special handling in several respects. First, it cannot be + * written to the inode file, so it is stored in the journal instead. + * + * Secondly, this inode cannot be written back and destroyed before all other + * inodes have been written. The ordering is important. Linux' VFS is happily + * unaware of the ordering constraint and would ordinarily destroy the master + * inode at umount time while other inodes are still in use and dirty. Not + * good. + * + * So logfs makes sure the master inode is not written until all other inodes + * have been destroyed. Sadly, this method has another side-effect. The VFS + * will notice one remaining inode and print a frightening warning message. + * Worse, it is impossible to judge whether such a warning was caused by the + * master inode or any other inodes have leaked as well. + * + * Our attempt of solving this is with logfs_new_meta_inode() below. Its + * purpose is to create a new inode that will not trigger the warning if such + * an inode is still in use. An ugly hack, no doubt. Suggections for + * improvement are welcome. + */ +struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode; + + inode = logfs_alloc_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_mode = S_IFREG; + inode->i_ino = ino; + inode->i_sb = sb; + + /* This is a blatant copy of alloc_inode code. We'd need alloc_inode + * to be nonstatic, alas. */ + { + struct address_space * const mapping = &inode->i_data; + + mapping->a_ops = &logfs_reg_aops; + mapping->host = inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = &default_backing_dev_info; + inode->i_mapping = mapping; + inode->i_nlink = 1; + } + + return inode; +} + +struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode; + int err; + + inode = logfs_new_meta_inode(sb, ino); + if (IS_ERR(inode)) + return inode; + + err = logfs_read_inode(inode); + if (err) { + destroy_meta_inode(inode); + return ERR_PTR(err); + } + logfs_inode_setops(inode); + return inode; +} + +static int logfs_write_inode(struct inode *inode, int do_sync) +{ + int ret; + long flags = WF_LOCK; + + /* Can only happen if creat() failed. Safe to skip. */ + if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN) + return 0; + + ret = __logfs_write_inode(inode, flags); + LOGFS_BUG_ON(ret, inode->i_sb); + return ret; +} + +void destroy_meta_inode(struct inode *inode) +{ + if (inode) { + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + logfs_clear_inode(inode); + kmem_cache_free(logfs_inode_cache, logfs_inode(inode)); + } +} + +/* called with inode_lock held */ +static void logfs_drop_inode(struct inode *inode) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_inode *li = logfs_inode(inode); + + spin_lock(&logfs_inode_lock); + list_move(&li->li_freeing_list, &super->s_freeing_list); + spin_unlock(&logfs_inode_lock); + generic_drop_inode(inode); +} + +static void logfs_set_ino_generation(struct super_block *sb, + struct inode *inode) +{ + struct logfs_super *super = logfs_super(sb); + u64 ino; + + mutex_lock(&super->s_journal_mutex); + ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino); + super->s_last_ino = ino; + super->s_inos_till_wrap--; + if (super->s_inos_till_wrap < 0) { + super->s_last_ino = LOGFS_RESERVED_INOS; + super->s_generation++; + super->s_inos_till_wrap = INOS_PER_WRAP; + } + inode->i_ino = ino; + inode->i_generation = super->s_generation; + mutex_unlock(&super->s_journal_mutex); +} + +struct inode *logfs_new_inode(struct inode *dir, int mode) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + logfs_init_inode(sb, inode); + + /* inherit parent flags */ + logfs_inode(inode)->li_flags |= + logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED; + + inode->i_mode = mode; + logfs_set_ino_generation(sb, inode); + + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + inode->i_mode |= S_ISGID; + } + + logfs_inode_setops(inode); + insert_inode_hash(inode); + + return inode; +} + +static void logfs_init_once(void *_li) +{ + struct logfs_inode *li = _li; + int i; + + li->li_flags = 0; + li->li_used_bytes = 0; + li->li_refcount = 1; + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = 0; + inode_init_once(&li->vfs_inode); +} + +static int logfs_sync_fs(struct super_block *sb, int wait) +{ + /* FIXME: write anchor */ + logfs_super(sb)->s_devops->sync(sb); + return 0; +} + +const struct super_operations logfs_super_operations = { + .alloc_inode = logfs_alloc_inode, + .clear_inode = logfs_clear_inode, + .delete_inode = logfs_delete_inode, + .destroy_inode = logfs_destroy_inode, + .drop_inode = logfs_drop_inode, + .write_inode = logfs_write_inode, + .statfs = logfs_statfs, + .sync_fs = logfs_sync_fs, +}; + +int logfs_init_inode_cache(void) +{ + logfs_inode_cache = kmem_cache_create("logfs_inode_cache", + sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT, + logfs_init_once); + if (!logfs_inode_cache) + return -ENOMEM; + return 0; +} + +void logfs_destroy_inode_cache(void) +{ + kmem_cache_destroy(logfs_inode_cache); +} diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c new file mode 100644 index 000000000000..7a023dbba9f8 --- /dev/null +++ b/fs/logfs/journal.c @@ -0,0 +1,879 @@ +/* + * fs/logfs/journal.c - journal handling code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + */ +#include "logfs.h" + +static void logfs_calc_free(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u64 reserve, no_segs = super->s_no_segs; + s64 free; + int i; + + /* superblock segments */ + no_segs -= 2; + super->s_no_journal_segs = 0; + /* journal */ + journal_for_each(i) + if (super->s_journal_seg[i]) { + no_segs--; + super->s_no_journal_segs++; + } + + /* open segments plus one extra per level for GC */ + no_segs -= 2 * super->s_total_levels; + + free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE); + free -= super->s_used_bytes; + /* just a bit extra */ + free -= super->s_total_levels * 4096; + + /* Bad blocks are 'paid' for with speed reserve - the filesystem + * simply gets slower as bad blocks accumulate. Until the bad blocks + * exceed the speed reserve - then the filesystem gets smaller. + */ + reserve = super->s_bad_segments + super->s_bad_seg_reserve; + reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE; + reserve = max(reserve, super->s_speed_reserve); + free -= reserve; + if (free < 0) + free = 0; + + super->s_free_bytes = free; +} + +static void reserve_sb_and_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct btree_head32 *head = &super->s_reserved_segments; + int i, err; + + err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1, + GFP_KERNEL); + BUG_ON(err); + + err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1, + GFP_KERNEL); + BUG_ON(err); + + journal_for_each(i) { + if (!super->s_journal_seg[i]) + continue; + err = btree_insert32(head, super->s_journal_seg[i], (void *)1, + GFP_KERNEL); + BUG_ON(err); + } +} + +static void read_dynsb(struct super_block *sb, + struct logfs_je_dynsb *dynsb) +{ + struct logfs_super *super = logfs_super(sb); + + super->s_gec = be64_to_cpu(dynsb->ds_gec); + super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper); + super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino); + super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir); + super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos); + super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes); + super->s_generation = be32_to_cpu(dynsb->ds_generation); +} + +static void read_anchor(struct super_block *sb, + struct logfs_je_anchor *da) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + int i; + + super->s_last_ino = be64_to_cpu(da->da_last_ino); + li->li_flags = 0; + li->li_height = da->da_height; + i_size_write(inode, be64_to_cpu(da->da_size)); + li->li_used_bytes = be64_to_cpu(da->da_used_bytes); + + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = be64_to_cpu(da->da_data[i]); +} + +static void read_erasecount(struct super_block *sb, + struct logfs_je_journal_ec *ec) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + journal_for_each(i) + super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]); +} + +static int read_area(struct super_block *sb, struct logfs_je_area *a) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[a->gc_level]; + u64 ofs; + u32 writemask = ~(super->s_writesize - 1); + + if (a->gc_level >= LOGFS_NO_AREAS) + return -EIO; + if (a->vim != VIM_DEFAULT) + return -EIO; /* TODO: close area and continue */ + + area->a_used_bytes = be32_to_cpu(a->used_bytes); + area->a_written_bytes = area->a_used_bytes & writemask; + area->a_segno = be32_to_cpu(a->segno); + if (area->a_segno) + area->a_is_open = 1; + + ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + if (super->s_writesize > 1) + logfs_buf_recover(area, ofs, a + 1, super->s_writesize); + else + logfs_buf_recover(area, ofs, NULL, 0); + return 0; +} + +static void *unpack(void *from, void *to) +{ + struct logfs_journal_header *jh = from; + void *data = from + sizeof(struct logfs_journal_header); + int err; + size_t inlen, outlen; + + inlen = be16_to_cpu(jh->h_len); + outlen = be16_to_cpu(jh->h_datalen); + + if (jh->h_compr == COMPR_NONE) + memcpy(to, data, inlen); + else { + err = logfs_uncompress(data, to, inlen, outlen); + BUG_ON(err); + } + return to; +} + +static int __read_je_header(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + struct logfs_super *super = logfs_super(sb); + size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) + + MAX_JOURNAL_HEADER; + u16 type, len, datalen; + int err; + + /* read header only */ + err = wbuf_read(sb, ofs, sizeof(*jh), jh); + if (err) + return err; + type = be16_to_cpu(jh->h_type); + len = be16_to_cpu(jh->h_len); + datalen = be16_to_cpu(jh->h_datalen); + if (len > sb->s_blocksize) + return -EIO; + if ((type < JE_FIRST) || (type > JE_LAST)) + return -EIO; + if (datalen > bufsize) + return -EIO; + return 0; +} + +static int __read_je_payload(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + u16 len; + int err; + + len = be16_to_cpu(jh->h_len); + err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1); + if (err) + return err; + if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) { + /* Old code was confused. It forgot about the header length + * and stopped calculating the crc 16 bytes before the end + * of data - ick! + * FIXME: Remove this hack once the old code is fixed. + */ + if (jh->h_crc == logfs_crc32(jh, len, 4)) + WARN_ON_ONCE(1); + else + return -EIO; + } + return 0; +} + +/* + * jh needs to be large enough to hold the complete entry, not just the header + */ +static int __read_je(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + int err; + + err = __read_je_header(sb, ofs, jh); + if (err) + return err; + return __read_je_payload(sb, ofs, jh); +} + +static int read_je(struct super_block *sb, u64 ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_journal_header *jh = super->s_compressed_je; + void *scratch = super->s_je; + u16 type, datalen; + int err; + + err = __read_je(sb, ofs, jh); + if (err) + return err; + type = be16_to_cpu(jh->h_type); + datalen = be16_to_cpu(jh->h_datalen); + + switch (type) { + case JE_DYNSB: + read_dynsb(sb, unpack(jh, scratch)); + break; + case JE_ANCHOR: + read_anchor(sb, unpack(jh, scratch)); + break; + case JE_ERASECOUNT: + read_erasecount(sb, unpack(jh, scratch)); + break; + case JE_AREA: + read_area(sb, unpack(jh, scratch)); + break; + case JE_OBJ_ALIAS: + err = logfs_load_object_aliases(sb, unpack(jh, scratch), + datalen); + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + return err; +} + +static int logfs_read_segment(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_journal_header *jh = super->s_compressed_je; + u64 ofs, seg_ofs = dev_ofs(sb, segno, 0); + u32 h_ofs, last_ofs = 0; + u16 len, datalen, last_len; + int i, err; + + /* search for most recent commit */ + for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) { + ofs = seg_ofs + h_ofs; + err = __read_je_header(sb, ofs, jh); + if (err) + continue; + if (jh->h_type != cpu_to_be16(JE_COMMIT)) + continue; + err = __read_je_payload(sb, ofs, jh); + if (err) + continue; + len = be16_to_cpu(jh->h_len); + datalen = be16_to_cpu(jh->h_datalen); + if ((datalen > sizeof(super->s_je_array)) || + (datalen % sizeof(__be64))) + continue; + last_ofs = h_ofs; + last_len = datalen; + h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh); + } + /* read commit */ + if (last_ofs == 0) + return -ENOENT; + ofs = seg_ofs + last_ofs; + log_journal("Read commit from %llx\n", ofs); + err = __read_je(sb, ofs, jh); + BUG_ON(err); /* We should have caught it in the scan loop already */ + if (err) + return err; + /* uncompress */ + unpack(jh, super->s_je_array); + super->s_no_je = last_len / sizeof(__be64); + /* iterate over array */ + for (i = 0; i < super->s_no_je; i++) { + err = read_je(sb, be64_to_cpu(super->s_je_array[i])); + if (err) + return err; + } + super->s_journal_area->a_segno = segno; + return 0; +} + +static u64 read_gec(struct super_block *sb, u32 segno) +{ + struct logfs_segment_header sh; + __be32 crc; + int err; + + if (!segno) + return 0; + err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); + if (err) + return 0; + crc = logfs_crc32(&sh, sizeof(sh), 4); + if (crc != sh.crc) { + WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull)); + /* Most likely it was just erased */ + return 0; + } + return be64_to_cpu(sh.gec); +} + +static int logfs_read_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u64 gec[LOGFS_JOURNAL_SEGS], max; + u32 segno; + int i, max_i; + + max = 0; + max_i = -1; + journal_for_each(i) { + segno = super->s_journal_seg[i]; + gec[i] = read_gec(sb, super->s_journal_seg[i]); + if (gec[i] > max) { + max = gec[i]; + max_i = i; + } + } + if (max_i == -1) + return -EIO; + /* FIXME: Try older segments in case of error */ + return logfs_read_segment(sb, super->s_journal_seg[max_i]); +} + +/* + * First search the current segment (outer loop), then pick the next segment + * in the array, skipping any zero entries (inner loop). + */ +static void journal_get_free_segment(struct logfs_area *area) +{ + struct logfs_super *super = logfs_super(area->a_sb); + int i; + + journal_for_each(i) { + if (area->a_segno != super->s_journal_seg[i]) + continue; + + do { + i++; + if (i == LOGFS_JOURNAL_SEGS) + i = 0; + } while (!super->s_journal_seg[i]); + + area->a_segno = super->s_journal_seg[i]; + area->a_erase_count = ++(super->s_journal_ec[i]); + log_journal("Journal now at %x (ec %x)\n", area->a_segno, + area->a_erase_count); + return; + } + BUG(); +} + +static void journal_get_erase_count(struct logfs_area *area) +{ + /* erase count is stored globally and incremented in + * journal_get_free_segment() - nothing to do here */ +} + +static int journal_erase_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_segment_header sh; + u64 ofs; + int err; + + err = logfs_erase_segment(sb, area->a_segno); + if (err) + return err; + + sh.pad = 0; + sh.type = SEG_JOURNAL; + sh.level = 0; + sh.segno = cpu_to_be32(area->a_segno); + sh.ec = cpu_to_be32(area->a_erase_count); + sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); + sh.crc = logfs_crc32(&sh, sizeof(sh), 4); + + /* This causes a bug in segment.c. Not yet. */ + //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0); + + ofs = dev_ofs(sb, area->a_segno, 0); + area->a_used_bytes = ALIGN(sizeof(sh), 16); + logfs_buf_write(area, ofs, &sh, sizeof(sh)); + return 0; +} + +static size_t __logfs_write_header(struct logfs_super *super, + struct logfs_journal_header *jh, size_t len, size_t datalen, + u16 type, u8 compr) +{ + jh->h_len = cpu_to_be16(len); + jh->h_type = cpu_to_be16(type); + jh->h_version = cpu_to_be16(++super->s_last_version); + jh->h_datalen = cpu_to_be16(datalen); + jh->h_compr = compr; + jh->h_pad[0] = 'H'; + jh->h_pad[1] = 'A'; + jh->h_pad[2] = 'T'; + jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4); + return ALIGN(len, 16) + sizeof(*jh); +} + +static size_t logfs_write_header(struct logfs_super *super, + struct logfs_journal_header *jh, size_t datalen, u16 type) +{ + size_t len = datalen; + + return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE); +} + +static inline size_t logfs_journal_erasecount_size(struct logfs_super *super) +{ + return LOGFS_JOURNAL_SEGS * sizeof(__be32); +} + +static void *logfs_write_erasecount(struct super_block *sb, void *_ec, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_journal_ec *ec = _ec; + int i; + + journal_for_each(i) + ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]); + *type = JE_ERASECOUNT; + *len = logfs_journal_erasecount_size(super); + return ec; +} + +static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore, + size_t ignore2) +{ + struct logfs_shadow *shadow = _shadow; + struct super_block *sb = (void *)_sb; + struct logfs_super *super = logfs_super(sb); + + /* consume new space */ + super->s_free_bytes -= shadow->new_len; + super->s_used_bytes += shadow->new_len; + super->s_dirty_used_bytes -= shadow->new_len; + + /* free up old space */ + super->s_free_bytes += shadow->old_len; + super->s_used_bytes -= shadow->old_len; + super->s_dirty_free_bytes -= shadow->old_len; + + logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len); + logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len); + + log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + mempool_free(shadow, super->s_shadow_pool); +} + +static void account_shadows(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + struct shadow_tree *tree = &super->s_shadow_tree; + + btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow); + btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow); + + if (li->li_block) { + /* + * We never actually use the structure, when attached to the + * master inode. But it is easier to always free it here than + * to have checks in several places elsewhere when allocating + * it. + */ + li->li_block->ops->free_block(sb, li->li_block); + } + BUG_ON((s64)li->li_used_bytes < 0); +} + +static void *__logfs_write_anchor(struct super_block *sb, void *_da, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_anchor *da = _da; + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + int i; + + da->da_height = li->li_height; + da->da_last_ino = cpu_to_be64(super->s_last_ino); + da->da_size = cpu_to_be64(i_size_read(inode)); + da->da_used_bytes = cpu_to_be64(li->li_used_bytes); + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + da->da_data[i] = cpu_to_be64(li->li_data[i]); + *type = JE_ANCHOR; + *len = sizeof(*da); + return da; +} + +static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_dynsb *dynsb = _dynsb; + + dynsb->ds_gec = cpu_to_be64(super->s_gec); + dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper); + dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino); + dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir); + dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos); + dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes); + dynsb->ds_generation = cpu_to_be32(super->s_generation); + *type = JE_DYNSB; + *len = sizeof(*dynsb); + return dynsb; +} + +static void write_wbuf(struct super_block *sb, struct logfs_area *area, + void *wbuf) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + u64 ofs; + pgoff_t index; + int page_ofs; + struct page *page; + + ofs = dev_ofs(sb, area->a_segno, + area->a_used_bytes & ~(super->s_writesize - 1)); + index = ofs >> PAGE_SHIFT; + page_ofs = ofs & (PAGE_SIZE - 1); + + page = find_lock_page(mapping, index); + BUG_ON(!page); + memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize); + unlock_page(page); +} + +static void *logfs_write_area(struct super_block *sb, void *_a, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[super->s_sum_index]; + struct logfs_je_area *a = _a; + + a->vim = VIM_DEFAULT; + a->gc_level = super->s_sum_index; + a->used_bytes = cpu_to_be32(area->a_used_bytes); + a->segno = cpu_to_be32(area->a_segno); + if (super->s_writesize > 1) + write_wbuf(sb, area, a + 1); + + *type = JE_AREA; + *len = sizeof(*a) + super->s_writesize; + return a; +} + +static void *logfs_write_commit(struct super_block *sb, void *h, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + + *type = JE_COMMIT; + *len = super->s_no_je * sizeof(__be64); + return super->s_je_array; +} + +static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type, + size_t len) +{ + struct logfs_super *super = logfs_super(sb); + void *header = super->s_compressed_je; + void *data = header + sizeof(struct logfs_journal_header); + ssize_t compr_len, pad_len; + u8 compr = COMPR_ZLIB; + + if (len == 0) + return logfs_write_header(super, header, 0, type); + + compr_len = logfs_compress(buf, data, len, sb->s_blocksize); + if (compr_len < 0 || type == JE_ANCHOR) { + BUG_ON(len > sb->s_blocksize); + memcpy(data, buf, len); + compr_len = len; + compr = COMPR_NONE; + } + + pad_len = ALIGN(compr_len, 16); + memset(data + compr_len, 0, pad_len - compr_len); + + return __logfs_write_header(super, header, compr_len, len, type, compr); +} + +static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes, + int must_pad) +{ + u32 writesize = logfs_super(area->a_sb)->s_writesize; + s32 ofs; + int ret; + + ret = logfs_open_area(area, *bytes); + if (ret) + return -EAGAIN; + + ofs = area->a_used_bytes; + area->a_used_bytes += *bytes; + + if (must_pad) { + area->a_used_bytes = ALIGN(area->a_used_bytes, writesize); + *bytes = area->a_used_bytes - ofs; + } + + return dev_ofs(area->a_sb, area->a_segno, ofs); +} + +static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type, + size_t buf_len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + struct logfs_journal_header *jh = super->s_compressed_je; + size_t len; + int must_pad = 0; + s64 ofs; + + len = __logfs_write_je(sb, buf, type, buf_len); + if (jh->h_type == cpu_to_be16(JE_COMMIT)) + must_pad = 1; + + ofs = logfs_get_free_bytes(area, &len, must_pad); + if (ofs < 0) + return ofs; + logfs_buf_write(area, ofs, super->s_compressed_je, len); + super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs); + return 0; +} + +static int logfs_write_je(struct super_block *sb, + void* (*write)(struct super_block *sb, void *scratch, + u16 *type, size_t *len)) +{ + void *buf; + size_t len; + u16 type; + + buf = write(sb, logfs_super(sb)->s_je, &type, &len); + return logfs_write_je_buf(sb, buf, type, len); +} + +int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_obj_alias *oa = super->s_je; + int err = 0, fill = super->s_je_fill; + + log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n", + fill, ino, bix, level, child_no, be64_to_cpu(val)); + oa[fill].ino = cpu_to_be64(ino); + oa[fill].bix = cpu_to_be64(bix); + oa[fill].val = val; + oa[fill].level = (__force u8)level; + oa[fill].child_no = cpu_to_be16(child_no); + fill++; + if (fill >= sb->s_blocksize / sizeof(*oa)) { + err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize); + fill = 0; + } + + super->s_je_fill = fill; + return err; +} + +static int logfs_write_obj_aliases(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int err; + + log_journal("logfs_write_obj_aliases: %d aliases to write\n", + super->s_no_object_aliases); + super->s_je_fill = 0; + err = logfs_write_obj_aliases_pagecache(sb); + if (err) + return err; + + if (super->s_je_fill) + err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS, + super->s_je_fill + * sizeof(struct logfs_obj_alias)); + return err; +} + +/* + * Write all journal entries. The goto logic ensures that all journal entries + * are written whenever a new segment is used. It is ugly and potentially a + * bit wasteful, but robustness is more important. With this we can *always* + * erase all journal segments except the one containing the most recent commit. + */ +void logfs_write_anchor(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + int i, err; + + BUG_ON(logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + mutex_lock(&super->s_journal_mutex); + + /* Do this first or suffer corruption */ + logfs_sync_segments(sb); + account_shadows(sb); + +again: + super->s_no_je = 0; + for_each_area(i) { + if (!super->s_area[i]->a_is_open) + continue; + super->s_sum_index = i; + err = logfs_write_je(sb, logfs_write_area); + if (err) + goto again; + } + err = logfs_write_obj_aliases(sb); + if (err) + goto again; + err = logfs_write_je(sb, logfs_write_erasecount); + if (err) + goto again; + err = logfs_write_je(sb, __logfs_write_anchor); + if (err) + goto again; + err = logfs_write_je(sb, logfs_write_dynsb); + if (err) + goto again; + /* + * Order is imperative. First we sync all writes, including the + * non-committed journal writes. Then we write the final commit and + * sync the current journal segment. + * There is a theoretical bug here. Syncing the journal segment will + * write a number of journal entries and the final commit. All these + * are written in a single operation. If the device layer writes the + * data back-to-front, the commit will precede the other journal + * entries, leaving a race window. + * Two fixes are possible. Preferred is to fix the device layer to + * ensure writes happen front-to-back. Alternatively we can insert + * another logfs_sync_area() super->s_devops->sync() combo before + * writing the commit. + */ + /* + * On another subject, super->s_devops->sync is usually not necessary. + * Unless called from sys_sync or friends, a barrier would suffice. + */ + super->s_devops->sync(sb); + err = logfs_write_je(sb, logfs_write_commit); + if (err) + goto again; + log_journal("Write commit to %llx\n", + be64_to_cpu(super->s_je_array[super->s_no_je - 1])); + logfs_sync_area(area); + BUG_ON(area->a_used_bytes != area->a_written_bytes); + super->s_devops->sync(sb); + + mutex_unlock(&super->s_journal_mutex); + return; +} + +void do_logfs_journal_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + u32 segno, ec; + int i, err; + + log_journal("Journal requires wear-leveling.\n"); + /* Drop old segments */ + journal_for_each(i) + if (super->s_journal_seg[i]) { + logfs_set_segment_unreserved(sb, + super->s_journal_seg[i], + super->s_journal_ec[i]); + super->s_journal_seg[i] = 0; + super->s_journal_ec[i] = 0; + } + /* Get new segments */ + for (i = 0; i < super->s_no_journal_segs; i++) { + segno = get_best_cand(sb, &super->s_reserve_list, &ec); + super->s_journal_seg[i] = segno; + super->s_journal_ec[i] = ec; + logfs_set_segment_reserved(sb, segno); + } + /* Manually move journal_area */ + area->a_segno = super->s_journal_seg[0]; + area->a_is_open = 0; + area->a_used_bytes = 0; + /* Write journal */ + logfs_write_anchor(super->s_master_inode); + /* Write superblocks */ + err = logfs_write_sb(sb); + BUG_ON(err); +} + +static const struct logfs_area_ops journal_area_ops = { + .get_free_segment = journal_get_free_segment, + .get_erase_count = journal_get_erase_count, + .erase_segment = journal_erase_segment, +}; + +int logfs_init_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) + + MAX_JOURNAL_HEADER; + int ret = -ENOMEM; + + mutex_init(&super->s_journal_mutex); + btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool); + + super->s_je = kzalloc(bufsize, GFP_KERNEL); + if (!super->s_je) + return ret; + + super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL); + if (!super->s_compressed_je) + return ret; + + super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER); + if (IS_ERR(super->s_master_inode)) + return PTR_ERR(super->s_master_inode); + + ret = logfs_read_journal(sb); + if (ret) + return -EIO; + + reserve_sb_and_journal(sb); + logfs_calc_free(sb); + + super->s_journal_area->a_ops = &journal_area_ops; + return 0; +} + +void logfs_cleanup_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + btree_grim_visitor32(&super->s_reserved_segments, 0, NULL); + destroy_meta_inode(super->s_master_inode); + super->s_master_inode = NULL; + + kfree(super->s_compressed_je); + kfree(super->s_je); +} diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h new file mode 100644 index 000000000000..e3082abe9e3b --- /dev/null +++ b/fs/logfs/logfs.h @@ -0,0 +1,722 @@ +/* + * fs/logfs/logfs.h + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + * + * Private header for logfs. + */ +#ifndef FS_LOGFS_LOGFS_H +#define FS_LOGFS_LOGFS_H + +#undef __CHECK_ENDIAN__ +#define __CHECK_ENDIAN__ + +#include <linux/btree.h> +#include <linux/crc32.h> +#include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/mempool.h> +#include <linux/pagemap.h> +#include <linux/mtd/mtd.h> +#include "logfs_abi.h" + +#define LOGFS_DEBUG_SUPER (0x0001) +#define LOGFS_DEBUG_SEGMENT (0x0002) +#define LOGFS_DEBUG_JOURNAL (0x0004) +#define LOGFS_DEBUG_DIR (0x0008) +#define LOGFS_DEBUG_FILE (0x0010) +#define LOGFS_DEBUG_INODE (0x0020) +#define LOGFS_DEBUG_READWRITE (0x0040) +#define LOGFS_DEBUG_GC (0x0080) +#define LOGFS_DEBUG_GC_NOISY (0x0100) +#define LOGFS_DEBUG_ALIASES (0x0200) +#define LOGFS_DEBUG_BLOCKMOVE (0x0400) +#define LOGFS_DEBUG_ALL (0xffffffff) + +#define LOGFS_DEBUG (0x01) +/* + * To enable specific log messages, simply define LOGFS_DEBUG to match any + * or all of the above. + */ +#ifndef LOGFS_DEBUG +#define LOGFS_DEBUG (0) +#endif + +#define log_cond(cond, fmt, arg...) do { \ + if (cond) \ + printk(KERN_DEBUG fmt, ##arg); \ +} while (0) + +#define log_super(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg) +#define log_segment(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg) +#define log_journal(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg) +#define log_dir(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg) +#define log_file(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg) +#define log_inode(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg) +#define log_readwrite(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg) +#define log_gc(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg) +#define log_gc_noisy(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg) +#define log_aliases(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg) +#define log_blockmove(fmt, arg...) \ + log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg) + +#define PG_pre_locked PG_owner_priv_1 +#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags) +#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags) +#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags) + +/* FIXME: This should really be somewhere in the 64bit area. */ +#define LOGFS_LINK_MAX (1<<30) + +/* Read-only filesystem */ +#define LOGFS_SB_FLAG_RO 0x0001 +#define LOGFS_SB_FLAG_SEG_ALIAS 0x0002 +#define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004 +#define LOGFS_SB_FLAG_SHUTDOWN 0x0008 + +/* Write Control Flags */ +#define WF_LOCK 0x01 /* take write lock */ +#define WF_WRITE 0x02 /* write block */ +#define WF_DELETE 0x04 /* delete old block */ + +typedef u8 __bitwise level_t; +typedef u8 __bitwise gc_level_t; + +#define LEVEL(level) ((__force level_t)(level)) +#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level)) + +#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)), \ + (__force level_t)((__force u8)(level) - 1) ) + +/** + * struct logfs_area - area management information + * + * @a_sb: the superblock this area belongs to + * @a_is_open: 1 if the area is currently open, else 0 + * @a_segno: segment number of area + * @a_written_bytes: number of bytes already written back + * @a_used_bytes: number of used bytes + * @a_ops: area operations (either journal or ostore) + * @a_erase_count: erase count + * @a_level: GC level + */ +struct logfs_area { /* a segment open for writing */ + struct super_block *a_sb; + int a_is_open; + u32 a_segno; + u32 a_written_bytes; + u32 a_used_bytes; + const struct logfs_area_ops *a_ops; + u32 a_erase_count; + gc_level_t a_level; +}; + +/** + * struct logfs_area_ops - area operations + * + * @get_free_segment: fill area->ofs with the offset of a free segment + * @get_erase_count: fill area->erase_count (needs area->ofs) + * @erase_segment: erase and setup segment + */ +struct logfs_area_ops { + void (*get_free_segment)(struct logfs_area *area); + void (*get_erase_count)(struct logfs_area *area); + int (*erase_segment)(struct logfs_area *area); +}; + +/** + * struct logfs_device_ops - device access operations + * + * @readpage: read one page (mm page) + * @writeseg: write one segment. may be a partial segment + * @erase: erase one segment + * @read: read from the device + * @erase: erase part of the device + */ +struct logfs_device_ops { + struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs); + struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs); + int (*write_sb)(struct super_block *sb, struct page *page); + int (*readpage)(void *_sb, struct page *page); + void (*writeseg)(struct super_block *sb, u64 ofs, size_t len); + int (*erase)(struct super_block *sb, loff_t ofs, size_t len); + void (*sync)(struct super_block *sb); + void (*put_device)(struct super_block *sb); +}; + +/** + * struct candidate_list - list of similar candidates + */ +struct candidate_list { + struct rb_root rb_tree; + int count; + int maxcount; + int sort_by_ec; +}; + +/** + * struct gc_candidate - "candidate" segment to be garbage collected next + * + * @list: list (either free of low) + * @segno: segment number + * @valid: number of valid bytes + * @erase_count: erase count of segment + * @dist: distance from tree root + * + * Candidates can be on two lists. The free list contains electees rather + * than candidates - segments that no longer contain any valid data. The + * low list contains candidates to be picked for GC. It should be kept + * short. It is not required to always pick a perfect candidate. In the + * worst case GC will have to move more data than absolutely necessary. + */ +struct gc_candidate { + struct rb_node rb_node; + struct candidate_list *list; + u32 segno; + u32 valid; + u32 erase_count; + u8 dist; +}; + +/** + * struct logfs_journal_entry - temporary structure used during journal scan + * + * @used: + * @version: normalized version + * @len: length + * @offset: offset + */ +struct logfs_journal_entry { + int used; + s16 version; + u16 len; + u16 datalen; + u64 offset; +}; + +enum transaction_state { + CREATE_1 = 1, + CREATE_2, + UNLINK_1, + UNLINK_2, + CROSS_RENAME_1, + CROSS_RENAME_2, + TARGET_RENAME_1, + TARGET_RENAME_2, + TARGET_RENAME_3 +}; + +/** + * struct logfs_transaction - essential fields to support atomic dirops + * + * @ino: target inode + * @dir: inode of directory containing dentry + * @pos: pos of dentry in directory + */ +struct logfs_transaction { + enum transaction_state state; + u64 ino; + u64 dir; + u64 pos; +}; + +/** + * struct logfs_shadow - old block in the shadow of a not-yet-committed new one + * @old_ofs: offset of old block on medium + * @new_ofs: offset of new block on medium + * @ino: inode number + * @bix: block index + * @old_len: size of old block, including header + * @new_len: size of new block, including header + * @level: block level + */ +struct logfs_shadow { + u64 old_ofs; + u64 new_ofs; + u64 ino; + u64 bix; + int old_len; + int new_len; + gc_level_t gc_level; +}; + +/** + * struct shadow_tree + * @new: shadows where old_ofs==0, indexed by new_ofs + * @old: shadows where old_ofs!=0, indexed by old_ofs + */ +struct shadow_tree { + struct btree_head64 new; + struct btree_head64 old; +}; + +struct object_alias_item { + struct list_head list; + __be64 val; + int child_no; +}; + +/** + * struct logfs_block - contains any block state + * @type: indirect block or inode + * @full: number of fully populated children + * @partial: number of partially populated children + * + * Most blocks are directly represented by page cache pages. But when a block + * becomes dirty, is part of a transaction, contains aliases or is otherwise + * special, a struct logfs_block is allocated to track the additional state. + * Inodes are very similar to indirect blocks, so they can also get one of + * these structures added when appropriate. + */ +#define BLOCK_INDIRECT 1 /* Indirect block */ +#define BLOCK_INODE 2 /* Inode */ +struct logfs_block_ops; +struct logfs_block { + struct list_head alias_list; + struct list_head item_list; + struct super_block *sb; + u64 ino; + u64 bix; + level_t level; + struct page *page; + struct inode *inode; + struct logfs_transaction *ta; + unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG]; + struct logfs_block_ops *ops; + int full; + int partial; + int reserved_bytes; +}; + +typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val); +struct logfs_block_ops { + void (*write_block)(struct logfs_block *block); + gc_level_t (*block_level)(struct logfs_block *block); + void (*free_block)(struct super_block *sb, struct logfs_block*block); + int (*write_alias)(struct super_block *sb, + struct logfs_block *block, + write_alias_t *write_one_alias); +}; + +struct logfs_super { + struct mtd_info *s_mtd; /* underlying device */ + struct block_device *s_bdev; /* underlying device */ + const struct logfs_device_ops *s_devops;/* device access */ + struct inode *s_master_inode; /* inode file */ + struct inode *s_segfile_inode; /* segment file */ + struct inode *s_mapping_inode; /* device mapping */ + atomic_t s_pending_writes; /* outstanting bios */ + long s_flags; + mempool_t *s_btree_pool; /* for btree nodes */ + mempool_t *s_alias_pool; /* aliases in segment.c */ + u64 s_feature_incompat; + u64 s_feature_ro_compat; + u64 s_feature_compat; + u64 s_feature_flags; + u64 s_sb_ofs[2]; + /* alias.c fields */ + struct btree_head32 s_segment_alias; /* remapped segments */ + int s_no_object_aliases; + struct list_head s_object_alias; /* remapped objects */ + struct btree_head128 s_object_alias_tree; /* remapped objects */ + struct mutex s_object_alias_mutex; + /* dir.c fields */ + struct mutex s_dirop_mutex; /* for creat/unlink/rename */ + u64 s_victim_ino; /* used for atomic dir-ops */ + u64 s_rename_dir; /* source directory ino */ + u64 s_rename_pos; /* position of source dd */ + /* gc.c fields */ + long s_segsize; /* size of a segment */ + int s_segshift; /* log2 of segment size */ + long s_segmask; /* 1 << s_segshift - 1 */ + long s_no_segs; /* segments on device */ + long s_no_journal_segs; /* segments used for journal */ + long s_no_blocks; /* blocks per segment */ + long s_writesize; /* minimum write size */ + int s_writeshift; /* log2 of write size */ + u64 s_size; /* filesystem size */ + struct logfs_area *s_area[LOGFS_NO_AREAS]; /* open segment array */ + u64 s_gec; /* global erase count */ + u64 s_wl_gec_ostore; /* time of last wl event */ + u64 s_wl_gec_journal; /* time of last wl event */ + u64 s_sweeper; /* current sweeper pos */ + u8 s_ifile_levels; /* max level of ifile */ + u8 s_iblock_levels; /* max level of regular files */ + u8 s_data_levels; /* # of segments to leaf block*/ + u8 s_total_levels; /* sum of above three */ + struct btree_head32 s_cand_tree; /* all candidates */ + struct candidate_list s_free_list; /* 100% free segments */ + struct candidate_list s_reserve_list; /* Bad segment reserve */ + struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */ + struct candidate_list s_ec_list; /* wear level candidates */ + struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. */ + /* inode.c fields */ + u64 s_last_ino; /* highest ino used */ + long s_inos_till_wrap; + u32 s_generation; /* i_generation for new files */ + struct list_head s_freeing_list; /* inodes being freed */ + /* journal.c fields */ + struct mutex s_journal_mutex; + void *s_je; /* journal entry to compress */ + void *s_compressed_je; /* block to write to journal */ + u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */ + u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */ + u64 s_last_version; + struct logfs_area *s_journal_area; /* open journal segment */ + __be64 s_je_array[64]; + int s_no_je; + + int s_sum_index; /* for the 12 summaries */ + struct shadow_tree s_shadow_tree; + int s_je_fill; /* index of current je */ + /* readwrite.c fields */ + struct mutex s_write_mutex; + int s_lock_count; + mempool_t *s_block_pool; /* struct logfs_block pool */ + mempool_t *s_shadow_pool; /* struct logfs_shadow pool */ + /* + * Space accounting: + * - s_used_bytes specifies space used to store valid data objects. + * - s_dirty_used_bytes is space used to store non-committed data + * objects. Those objects have already been written themselves, + * but they don't become valid until all indirect blocks up to the + * journal have been written as well. + * - s_dirty_free_bytes is space used to store the old copy of a + * replaced object, as long as the replacement is non-committed. + * In other words, it is the amount of space freed when all dirty + * blocks are written back. + * - s_free_bytes is the amount of free space available for any + * purpose. + * - s_root_reserve is the amount of free space available only to + * the root user. Non-privileged users can no longer write once + * this watermark has been reached. + * - s_speed_reserve is space which remains unused to speed up + * garbage collection performance. + * - s_dirty_pages is the space reserved for currently dirty pages. + * It is a pessimistic estimate, so some/most will get freed on + * page writeback. + * + * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size + */ + u64 s_free_bytes; + u64 s_used_bytes; + u64 s_dirty_free_bytes; + u64 s_dirty_used_bytes; + u64 s_root_reserve; + u64 s_speed_reserve; + u64 s_dirty_pages; + /* Bad block handling: + * - s_bad_seg_reserve is a number of segments usually kept + * free. When encountering bad blocks, the affected segment's data + * is _temporarily_ moved to a reserved segment. + * - s_bad_segments is the number of known bad segments. + */ + u32 s_bad_seg_reserve; + u32 s_bad_segments; +}; + +/** + * struct logfs_inode - in-memory inode + * + * @vfs_inode: struct inode + * @li_data: data pointers + * @li_used_bytes: number of used bytes + * @li_freeing_list: used to track inodes currently being freed + * @li_flags: inode flags + * @li_refcount: number of internal (GC-induced) references + */ +struct logfs_inode { + struct inode vfs_inode; + u64 li_data[LOGFS_EMBEDDED_FIELDS]; + u64 li_used_bytes; + struct list_head li_freeing_list; + struct logfs_block *li_block; + u32 li_flags; + u8 li_height; + int li_refcount; +}; + +#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++) +#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++) +#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--) + +/* compr.c */ +int logfs_compress(void *in, void *out, size_t inlen, size_t outlen); +int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen); +int __init logfs_compr_init(void); +void logfs_compr_exit(void); + +/* dev_bdev.c */ +#ifdef CONFIG_BLOCK +int logfs_get_sb_bdev(struct file_system_type *type, int flags, + const char *devname, struct vfsmount *mnt); +#else +static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags, + const char *devname, struct vfsmount *mnt) +{ + return -ENODEV; +} +#endif + +/* dev_mtd.c */ +#ifdef CONFIG_MTD +int logfs_get_sb_mtd(struct file_system_type *type, int flags, + int mtdnr, struct vfsmount *mnt); +#else +static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags, + int mtdnr, struct vfsmount *mnt) +{ + return -ENODEV; +} +#endif + +/* dir.c */ +extern const struct inode_operations logfs_symlink_iops; +extern const struct inode_operations logfs_dir_iops; +extern const struct file_operations logfs_dir_fops; +int logfs_replay_journal(struct super_block *sb); + +/* file.c */ +extern const struct inode_operations logfs_reg_iops; +extern const struct file_operations logfs_reg_fops; +extern const struct address_space_operations logfs_reg_aops; +int logfs_readpage(struct file *file, struct page *page); +int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, + unsigned long arg); +int logfs_fsync(struct file *file, struct dentry *dentry, int datasync); + +/* gc.c */ +u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec); +void logfs_gc_pass(struct super_block *sb); +int logfs_check_areas(struct super_block *sb); +int logfs_init_gc(struct super_block *sb); +void logfs_cleanup_gc(struct super_block *sb); + +/* inode.c */ +extern const struct super_operations logfs_super_operations; +struct inode *logfs_iget(struct super_block *sb, ino_t ino); +struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie); +void logfs_safe_iput(struct inode *inode, int cookie); +struct inode *logfs_new_inode(struct inode *dir, int mode); +struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino); +struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino); +int logfs_init_inode_cache(void); +void logfs_destroy_inode_cache(void); +void destroy_meta_inode(struct inode *inode); +void logfs_set_blocks(struct inode *inode, u64 no); +/* these logically belong into inode.c but actually reside in readwrite.c */ +int logfs_read_inode(struct inode *inode); +int __logfs_write_inode(struct inode *inode, long flags); +void logfs_delete_inode(struct inode *inode); +void logfs_clear_inode(struct inode *inode); + +/* journal.c */ +void logfs_write_anchor(struct inode *inode); +int logfs_init_journal(struct super_block *sb); +void logfs_cleanup_journal(struct super_block *sb); +int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val); +void do_logfs_journal_wl_pass(struct super_block *sb); + +/* readwrite.c */ +pgoff_t logfs_pack_index(u64 bix, level_t level); +void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level); +int logfs_inode_write(struct inode *inode, const void *buf, size_t count, + loff_t bix, long flags, struct shadow_tree *shadow_tree); +int logfs_readpage_nolock(struct page *page); +int logfs_write_buf(struct inode *inode, struct page *page, long flags); +int logfs_delete(struct inode *inode, pgoff_t index, + struct shadow_tree *shadow_tree); +int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, + gc_level_t gc_level, long flags); +int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix, + gc_level_t gc_level); +int logfs_truncate(struct inode *inode, u64 size); +u64 logfs_seek_hole(struct inode *inode, u64 bix); +u64 logfs_seek_data(struct inode *inode, u64 bix); +int logfs_open_segfile(struct super_block *sb); +int logfs_init_rw(struct super_block *sb); +void logfs_cleanup_rw(struct super_block *sb); +void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta); +void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta); +void logfs_write_block(struct logfs_block *block, long flags); +int logfs_write_obj_aliases_pagecache(struct super_block *sb); +void logfs_get_segment_entry(struct super_block *sb, u32 segno, + struct logfs_segment_entry *se); +void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment); +void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec, + gc_level_t gc_level); +void logfs_set_segment_reserved(struct super_block *sb, u32 segno); +void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec); +struct logfs_block *__alloc_block(struct super_block *sb, + u64 ino, u64 bix, level_t level); +void __free_block(struct super_block *sb, struct logfs_block *block); +void btree_write_block(struct logfs_block *block); +void initialize_block_counters(struct page *page, struct logfs_block *block, + __be64 *array, int page_is_empty); +int logfs_exist_block(struct inode *inode, u64 bix); +int get_page_reserve(struct inode *inode, struct page *page); +extern struct logfs_block_ops indirect_block_ops; + +/* segment.c */ +int logfs_erase_segment(struct super_block *sb, u32 ofs); +int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf); +int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix, + level_t level); +int logfs_segment_write(struct inode *inode, struct page *page, + struct logfs_shadow *shadow); +int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow); +int logfs_load_object_aliases(struct super_block *sb, + struct logfs_obj_alias *oa, int count); +void move_page_to_btree(struct page *page); +int logfs_init_mapping(struct super_block *sb); +void logfs_sync_area(struct logfs_area *area); +void logfs_sync_segments(struct super_block *sb); + +/* area handling */ +int logfs_init_areas(struct super_block *sb); +void logfs_cleanup_areas(struct super_block *sb); +int logfs_open_area(struct logfs_area *area, size_t bytes); +void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, + int use_filler); + +static inline void logfs_buf_write(struct logfs_area *area, u64 ofs, + void *buf, size_t len) +{ + __logfs_buf_write(area, ofs, buf, len, 0); +} + +static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs, + void *buf, size_t len) +{ + __logfs_buf_write(area, ofs, buf, len, 1); +} + +/* super.c */ +struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index); +void emergency_read_end(struct page *page); +void logfs_crash_dump(struct super_block *sb); +void *memchr_inv(const void *s, int c, size_t n); +int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); +int logfs_get_sb_device(struct file_system_type *type, int flags, + struct mtd_info *mtd, struct block_device *bdev, + const struct logfs_device_ops *devops, struct vfsmount *mnt); +int logfs_check_ds(struct logfs_disk_super *ds); +int logfs_write_sb(struct super_block *sb); + +static inline struct logfs_super *logfs_super(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct logfs_inode *logfs_inode(struct inode *inode) +{ + return container_of(inode, struct logfs_inode, vfs_inode); +} + +static inline void logfs_set_ro(struct super_block *sb) +{ + logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO; +} + +#define LOGFS_BUG(sb) do { \ + struct super_block *__sb = sb; \ + logfs_crash_dump(__sb); \ + logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO; \ + BUG(); \ +} while (0) + +#define LOGFS_BUG_ON(condition, sb) \ + do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0) + +static inline __be32 logfs_crc32(void *data, size_t len, size_t skip) +{ + return cpu_to_be32(crc32(~0, data+skip, len-skip)); +} + +static inline u8 logfs_type(struct inode *inode) +{ + return (inode->i_mode >> 12) & 15; +} + +static inline pgoff_t logfs_index(struct super_block *sb, u64 pos) +{ + return pos >> sb->s_blocksize_bits; +} + +static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs) +{ + return ((u64)segno << logfs_super(sb)->s_segshift) + ofs; +} + +static inline u32 seg_no(struct super_block *sb, u64 ofs) +{ + return ofs >> logfs_super(sb)->s_segshift; +} + +static inline u32 seg_ofs(struct super_block *sb, u64 ofs) +{ + return ofs & logfs_super(sb)->s_segmask; +} + +static inline u64 seg_align(struct super_block *sb, u64 ofs) +{ + return ofs & ~logfs_super(sb)->s_segmask; +} + +static inline struct logfs_block *logfs_block(struct page *page) +{ + return (void *)page->private; +} + +static inline level_t shrink_level(gc_level_t __level) +{ + u8 level = (__force u8)__level; + + if (level >= LOGFS_MAX_LEVELS) + level -= LOGFS_MAX_LEVELS; + return (__force level_t)level; +} + +static inline gc_level_t expand_level(u64 ino, level_t __level) +{ + u8 level = (__force u8)__level; + + if (ino == LOGFS_INO_MASTER) { + /* ifile has seperate areas */ + level += LOGFS_MAX_LEVELS; + } + return (__force gc_level_t)level; +} + +static inline int logfs_block_shift(struct super_block *sb, level_t level) +{ + level = shrink_level((__force gc_level_t)level); + return (__force int)level * (sb->s_blocksize_bits - 3); +} + +static inline u64 logfs_block_mask(struct super_block *sb, level_t level) +{ + return ~0ull << logfs_block_shift(sb, level); +} + +static inline struct logfs_area *get_area(struct super_block *sb, + gc_level_t gc_level) +{ + return logfs_super(sb)->s_area[(__force u8)gc_level]; +} + +#endif diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h new file mode 100644 index 000000000000..5d3782ddecc8 --- /dev/null +++ b/fs/logfs/logfs_abi.h @@ -0,0 +1,627 @@ +/* + * fs/logfs/logfs_abi.h + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + * + * Public header for logfs. + */ +#ifndef FS_LOGFS_LOGFS_ABI_H +#define FS_LOGFS_LOGFS_ABI_H + +/* For out-of-kernel compiles */ +#ifndef BUILD_BUG_ON +#define BUILD_BUG_ON(condition) /**/ +#endif + +#define SIZE_CHECK(type, size) \ +static inline void check_##type(void) \ +{ \ + BUILD_BUG_ON(sizeof(struct type) != (size)); \ +} + +/* + * Throughout the logfs code, we're constantly dealing with blocks at + * various positions or offsets. To remove confusion, we stricly + * distinguish between a "position" - the logical position within a + * file and an "offset" - the physical location within the device. + * + * Any usage of the term offset for a logical location or position for + * a physical one is a bug and should get fixed. + */ + +/* + * Block are allocated in one of several segments depending on their + * level. The following levels are used: + * 0 - regular data block + * 1 - i1 indirect blocks + * 2 - i2 indirect blocks + * 3 - i3 indirect blocks + * 4 - i4 indirect blocks + * 5 - i5 indirect blocks + * 6 - ifile data blocks + * 7 - ifile i1 indirect blocks + * 8 - ifile i2 indirect blocks + * 9 - ifile i3 indirect blocks + * 10 - ifile i4 indirect blocks + * 11 - ifile i5 indirect blocks + * Potential levels to be used in the future: + * 12 - gc recycled blocks, long-lived data + * 13 - replacement blocks, short-lived data + * + * Levels 1-11 are necessary for robust gc operations and help seperate + * short-lived metadata from longer-lived file data. In the future, + * file data should get seperated into several segments based on simple + * heuristics. Old data recycled during gc operation is expected to be + * long-lived. New data is of uncertain life expectancy. New data + * used to replace older blocks in existing files is expected to be + * short-lived. + */ + + +/* Magic numbers. 64bit for superblock, 32bit for statfs f_type */ +#define LOGFS_MAGIC 0xb21f205ac97e8168ull +#define LOGFS_MAGIC_U32 0xc97e8168u + +/* + * Various blocksize related macros. Blocksize is currently fixed at 4KiB. + * Sooner or later that should become configurable and the macros replaced + * by something superblock-dependent. Pointers in indirect blocks are and + * will remain 64bit. + * + * LOGFS_BLOCKSIZE - self-explaining + * LOGFS_BLOCK_FACTOR - number of pointers per indirect block + * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts + */ +#define LOGFS_BLOCKSIZE (4096ull) +#define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64)) +#define LOGFS_BLOCK_BITS (9) + +/* + * Number of blocks at various levels of indirection. There are 16 direct + * block pointers plus a single indirect pointer. + */ +#define I0_BLOCKS (16) +#define I1_BLOCKS LOGFS_BLOCK_FACTOR +#define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS) +#define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS) +#define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS) +#define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS) + +#define INDIRECT_INDEX I0_BLOCKS +#define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1) + +/* + * Sizes at which files require another level of indirection. Files smaller + * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself, + * similar like ext2 fast symlinks. + * + * Data at a position smaller than LOGFS_I0_SIZE is accessed through the + * direct pointers, else through the 1x indirect pointer and so forth. + */ +#define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64)) +#define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE) +#define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE) + +/* + * Each indirect block pointer must have this flag set, if all block pointers + * behind it are set, i.e. there is no hole hidden in the shadow of this + * indirect block pointer. + */ +#define LOGFS_FULLY_POPULATED (1ULL << 63) +#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED) + +/* + * LogFS needs to seperate data into levels. Each level is defined as the + * maximal possible distance from the master inode (inode of the inode file). + * Data blocks reside on level 0, 1x indirect block on level 1, etc. + * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11. + * This effort is necessary to guarantee garbage collection to always make + * progress. + * + * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks, + * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is + * the maximal number of levels for one file. + * LOGFS_NO_AREAS is twice that, as the inode file and regular files are + * effectively stacked on top of each other. + */ +#define LOGFS_MAX_INDIRECT (5) +#define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1) +#define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS) + +/* Maximum size of filenames */ +#define LOGFS_MAX_NAMELEN (255) + +/* Number of segments in the primary journal. */ +#define LOGFS_JOURNAL_SEGS (16) + +/* Maximum number of free/erased/etc. segments in journal entries */ +#define MAX_CACHED_SEGS (64) + + +/* + * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store, + * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including + * its header, + * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for + * its segment header and the padded space at the end when no further objects + * fit. + */ +#define LOGFS_OBJECT_HEADERSIZE (0x1c) +#define LOGFS_SEGMENT_HEADERSIZE (0x18) +#define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE) +#define LOGFS_SEGMENT_RESERVE \ + (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1) + +/* + * Segment types: + * SEG_SUPER - Data or indirect block + * SEG_JOURNAL - Inode + * SEG_OSTORE - Dentry + */ +enum { + SEG_SUPER = 0x01, + SEG_JOURNAL = 0x02, + SEG_OSTORE = 0x03, +}; + +/** + * struct logfs_segment_header - per-segment header in the ostore + * + * @crc: crc32 of header (there is no data) + * @pad: unused, must be 0 + * @type: segment type, see above + * @level: GC level for all objects in this segment + * @segno: segment number + * @ec: erase count for this segment + * @gec: global erase count at time of writing + */ +struct logfs_segment_header { + __be32 crc; + __be16 pad; + __u8 type; + __u8 level; + __be32 segno; + __be32 ec; + __be64 gec; +}; + +SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE); + +/** + * struct logfs_disk_super - on-medium superblock + * + * @ds_magic: magic number, must equal LOGFS_MAGIC + * @ds_crc: crc32 of structure starting with the next field + * @ds_ifile_levels: maximum number of levels for ifile + * @ds_iblock_levels: maximum number of levels for regular files + * @ds_data_levels: number of seperate levels for data + * @pad0: reserved, must be 0 + * @ds_feature_incompat: incompatible filesystem features + * @ds_feature_ro_compat: read-only compatible filesystem features + * @ds_feature_compat: compatible filesystem features + * @ds_flags: flags + * @ds_segment_shift: log2 of segment size + * @ds_block_shift: log2 of block size + * @ds_write_shift: log2 of write size + * @pad1: reserved, must be 0 + * @ds_journal_seg: segments used by primary journal + * @ds_root_reserve: bytes reserved for the superuser + * @ds_speed_reserve: bytes reserved to speed up GC + * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks + * @pad2: reserved, must be 0 + * @pad3: reserved, must be 0 + * + * Contains only read-only fields. Read-write fields like the amount of used + * space is tracked in the dynamic superblock, which is stored in the journal. + */ +struct logfs_disk_super { + struct logfs_segment_header ds_sh; + __be64 ds_magic; + + __be32 ds_crc; + __u8 ds_ifile_levels; + __u8 ds_iblock_levels; + __u8 ds_data_levels; + __u8 ds_segment_shift; + __u8 ds_block_shift; + __u8 ds_write_shift; + __u8 pad0[6]; + + __be64 ds_filesystem_size; + __be32 ds_segment_size; + __be32 ds_bad_seg_reserve; + + __be64 ds_feature_incompat; + __be64 ds_feature_ro_compat; + + __be64 ds_feature_compat; + __be64 ds_feature_flags; + + __be64 ds_root_reserve; + __be64 ds_speed_reserve; + + __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS]; + + __be64 ds_super_ofs[2]; + __be64 pad3[8]; +}; + +SIZE_CHECK(logfs_disk_super, 256); + +/* + * Object types: + * OBJ_BLOCK - Data or indirect block + * OBJ_INODE - Inode + * OBJ_DENTRY - Dentry + */ +enum { + OBJ_BLOCK = 0x04, + OBJ_INODE = 0x05, + OBJ_DENTRY = 0x06, +}; + +/** + * struct logfs_object_header - per-object header in the ostore + * + * @crc: crc32 of header, excluding data_crc + * @len: length of data + * @type: object type, see above + * @compr: compression type + * @ino: inode number + * @bix: block index + * @data_crc: crc32 of payload + */ +struct logfs_object_header { + __be32 crc; + __be16 len; + __u8 type; + __u8 compr; + __be64 ino; + __be64 bix; + __be32 data_crc; +} __attribute__((packed)); + +SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE); + +/* + * Reserved inode numbers: + * LOGFS_INO_MASTER - master inode (for inode file) + * LOGFS_INO_ROOT - root directory + * LOGFS_INO_SEGFILE - per-segment used bytes and erase count + */ +enum { + LOGFS_INO_MAPPING = 0x00, + LOGFS_INO_MASTER = 0x01, + LOGFS_INO_ROOT = 0x02, + LOGFS_INO_SEGFILE = 0x03, + LOGFS_RESERVED_INOS = 0x10, +}; + +/* + * Inode flags. High bits should never be written to the medium. They are + * reserved for in-memory usage. + * Low bits should either remain in sync with the corresponding FS_*_FL or + * reuse slots that obviously don't make sense for logfs. + * + * LOGFS_IF_DIRTY Inode must be written back + * LOGFS_IF_ZOMBIE Inode has been deleted + * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode + */ +#define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */ +#define LOGFS_IF_DIRTY 0x20000000 +#define LOGFS_IF_ZOMBIE 0x40000000 +#define LOGFS_IF_STILLBORN 0x80000000 + +/* Flags available to chattr */ +#define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED) +#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED) +/* Flags inherited from parent directory on file/directory creation */ +#define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED) + +/** + * struct logfs_disk_inode - on-medium inode + * + * @di_mode: file mode + * @di_pad: reserved, must be 0 + * @di_flags: inode flags, see above + * @di_uid: user id + * @di_gid: group id + * @di_ctime: change time + * @di_mtime: modify time + * @di_refcount: reference count (aka nlink or link count) + * @di_generation: inode generation, for nfs + * @di_used_bytes: number of bytes used + * @di_size: file size + * @di_data: data pointers + */ +struct logfs_disk_inode { + __be16 di_mode; + __u8 di_height; + __u8 di_pad; + __be32 di_flags; + __be32 di_uid; + __be32 di_gid; + + __be64 di_ctime; + __be64 di_mtime; + + __be64 di_atime; + __be32 di_refcount; + __be32 di_generation; + + __be64 di_used_bytes; + __be64 di_size; + + __be64 di_data[LOGFS_EMBEDDED_FIELDS]; +}; + +SIZE_CHECK(logfs_disk_inode, 200); + +#define INODE_POINTER_OFS \ + (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64)) +#define INODE_USED_OFS \ + (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64)) +#define INODE_SIZE_OFS \ + (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64)) +#define INODE_HEIGHT_OFS (0) + +/** + * struct logfs_disk_dentry - on-medium dentry structure + * + * @ino: inode number + * @namelen: length of file name + * @type: file type, identical to bits 12..15 of mode + * @name: file name + */ +/* FIXME: add 6 bytes of padding to remove the __packed */ +struct logfs_disk_dentry { + __be64 ino; + __be16 namelen; + __u8 type; + __u8 name[LOGFS_MAX_NAMELEN]; +} __attribute__((packed)); + +SIZE_CHECK(logfs_disk_dentry, 266); + +#define RESERVED 0xffffffff +#define BADSEG 0xffffffff +/** + * struct logfs_segment_entry - segment file entry + * + * @ec_level: erase count and level + * @valid: number of valid bytes + * + * Segment file contains one entry for every segment. ec_level contains the + * erasecount in the upper 28 bits and the level in the lower 4 bits. An + * ec_level of BADSEG (-1) identifies bad segments. valid contains the number + * of valid bytes or RESERVED (-1 again) if the segment is used for either the + * superblock or the journal, or when the segment is bad. + */ +struct logfs_segment_entry { + __be32 ec_level; + __be32 valid; +}; + +SIZE_CHECK(logfs_segment_entry, 8); + +/** + * struct logfs_journal_header - header for journal entries (JEs) + * + * @h_crc: crc32 of journal entry + * @h_len: length of compressed journal entry, + * not including header + * @h_datalen: length of uncompressed data + * @h_type: JE type + * @h_version: unnormalized version of journal entry + * @h_compr: compression type + * @h_pad: reserved + */ +struct logfs_journal_header { + __be32 h_crc; + __be16 h_len; + __be16 h_datalen; + __be16 h_type; + __be16 h_version; + __u8 h_compr; + __u8 h_pad[3]; +}; + +SIZE_CHECK(logfs_journal_header, 16); + +/* + * Life expectency of data. + * VIM_DEFAULT - default vim + * VIM_SEGFILE - for segment file only - very short-living + * VIM_GC - GC'd data - likely long-living + */ +enum logfs_vim { + VIM_DEFAULT = 0, + VIM_SEGFILE = 1, +}; + +/** + * struct logfs_je_area - wbuf header + * + * @segno: segment number of area + * @used_bytes: number of bytes already used + * @gc_level: GC level + * @vim: life expectancy of data + * + * "Areas" are segments currently being used for writing. There is at least + * one area per GC level. Several may be used to seperate long-living from + * short-living data. If an area with unknown vim is encountered, it can + * simply be closed. + * The write buffer immediately follow this header. + */ +struct logfs_je_area { + __be32 segno; + __be32 used_bytes; + __u8 gc_level; + __u8 vim; +} __attribute__((packed)); + +SIZE_CHECK(logfs_je_area, 10); + +#define MAX_JOURNAL_HEADER \ + (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area)) + +/** + * struct logfs_je_dynsb - dynamic superblock + * + * @ds_gec: global erase count + * @ds_sweeper: current position of GC "sweeper" + * @ds_rename_dir: source directory ino (see dir.c documentation) + * @ds_rename_pos: position of source dd (see dir.c documentation) + * @ds_victim_ino: victims of incomplete dir operation (see dir.c) + * @ds_victim_ino: parent inode of victim (see dir.c) + * @ds_used_bytes: number of used bytes + */ +struct logfs_je_dynsb { + __be64 ds_gec; + __be64 ds_sweeper; + + __be64 ds_rename_dir; + __be64 ds_rename_pos; + + __be64 ds_victim_ino; + __be64 ds_victim_parent; /* XXX */ + + __be64 ds_used_bytes; + __be32 ds_generation; + __be32 pad; +}; + +SIZE_CHECK(logfs_je_dynsb, 64); + +/** + * struct logfs_je_anchor - anchor of filesystem tree, aka master inode + * + * @da_size: size of inode file + * @da_last_ino: last created inode + * @da_used_bytes: number of bytes used + * @da_data: data pointers + */ +struct logfs_je_anchor { + __be64 da_size; + __be64 da_last_ino; + + __be64 da_used_bytes; + u8 da_height; + u8 pad[7]; + + __be64 da_data[LOGFS_EMBEDDED_FIELDS]; +}; + +SIZE_CHECK(logfs_je_anchor, 168); + +/** + * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal) + * + * @so_segment: segments used for 2nd journal + * + * Length of the array is given by h_len field in the header. + */ +struct logfs_je_spillout { + __be64 so_segment[0]; +}; + +SIZE_CHECK(logfs_je_spillout, 0); + +/** + * struct logfs_je_journal_ec - erase counts for all journal segments + * + * @ec: erase count + * + * Length of the array is given by h_len field in the header. + */ +struct logfs_je_journal_ec { + __be32 ec[0]; +}; + +SIZE_CHECK(logfs_je_journal_ec, 0); + +/** + * struct logfs_je_free_segments - list of free segmetns with erase count + */ +struct logfs_je_free_segments { + __be32 segno; + __be32 ec; +}; + +SIZE_CHECK(logfs_je_free_segments, 8); + +/** + * struct logfs_seg_alias - list of segment aliases + */ +struct logfs_seg_alias { + __be32 old_segno; + __be32 new_segno; +}; + +SIZE_CHECK(logfs_seg_alias, 8); + +/** + * struct logfs_obj_alias - list of object aliases + */ +struct logfs_obj_alias { + __be64 ino; + __be64 bix; + __be64 val; + u8 level; + u8 pad[5]; + __be16 child_no; +}; + +SIZE_CHECK(logfs_obj_alias, 32); + +/** + * Compression types. + * + * COMPR_NONE - uncompressed + * COMPR_ZLIB - compressed with zlib + */ +enum { + COMPR_NONE = 0, + COMPR_ZLIB = 1, +}; + +/* + * Journal entries come in groups of 16. First group contains unique + * entries, next groups contain one entry per level + * + * JE_FIRST - smallest possible journal entry number + * + * JEG_BASE - base group, containing unique entries + * JE_COMMIT - commit entry, validates all previous entries + * JE_DYNSB - dynamic superblock, anything that ought to be in the + * superblock but cannot because it is read-write data + * JE_ANCHOR - anchor aka master inode aka inode file's inode + * JE_ERASECOUNT erasecounts for all journal segments + * JE_SPILLOUT - unused + * JE_SEG_ALIAS - aliases segments + * JE_AREA - area description + * + * JE_LAST - largest possible journal entry number + */ +enum { + JE_FIRST = 0x01, + + JEG_BASE = 0x00, + JE_COMMIT = 0x02, + JE_DYNSB = 0x03, + JE_ANCHOR = 0x04, + JE_ERASECOUNT = 0x05, + JE_SPILLOUT = 0x06, + JE_OBJ_ALIAS = 0x0d, + JE_AREA = 0x0e, + + JE_LAST = 0x0e, +}; + +#endif diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c new file mode 100644 index 000000000000..1dbe6e8cccec --- /dev/null +++ b/fs/logfs/readwrite.c @@ -0,0 +1,2246 @@ +/* + * fs/logfs/readwrite.c + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + * + * + * Actually contains five sets of very similar functions: + * read read blocks from a file + * seek_hole find next hole + * seek_data find next data block + * valid check whether a block still belongs to a file + * write write blocks to a file + * delete delete a block (for directories and ifile) + * rewrite move existing blocks of a file to a new location (gc helper) + * truncate truncate a file + */ +#include "logfs.h" +#include <linux/sched.h> + +static u64 adjust_bix(u64 bix, level_t level) +{ + switch (level) { + case 0: + return bix; + case LEVEL(1): + return max_t(u64, bix, I0_BLOCKS); + case LEVEL(2): + return max_t(u64, bix, I1_BLOCKS); + case LEVEL(3): + return max_t(u64, bix, I2_BLOCKS); + case LEVEL(4): + return max_t(u64, bix, I3_BLOCKS); + case LEVEL(5): + return max_t(u64, bix, I4_BLOCKS); + default: + WARN_ON(1); + return bix; + } +} + +static inline u64 maxbix(u8 height) +{ + return 1ULL << (LOGFS_BLOCK_BITS * height); +} + +/** + * The inode address space is cut in two halves. Lower half belongs to data + * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is + * set, the actual block index (bix) and level can be derived from the page + * index. + * + * The lowest three bits of the block index are set to 0 after packing and + * unpacking. Since the lowest n bits (9 for 4KiB blocksize) are ignored + * anyway this is harmless. + */ +#define ARCH_SHIFT (BITS_PER_LONG - 32) +#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT) +#define LEVEL_SHIFT (28 + ARCH_SHIFT) +static inline pgoff_t first_indirect_block(void) +{ + return INDIRECT_BIT | (1ULL << LEVEL_SHIFT); +} + +pgoff_t logfs_pack_index(u64 bix, level_t level) +{ + pgoff_t index; + + BUG_ON(bix >= INDIRECT_BIT); + if (level == 0) + return bix; + + index = INDIRECT_BIT; + index |= (__force long)level << LEVEL_SHIFT; + index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS); + return index; +} + +void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level) +{ + u8 __level; + + if (!(index & INDIRECT_BIT)) { + *bix = index; + *level = 0; + return; + } + + __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT; + *level = LEVEL(__level); + *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT; + *bix = adjust_bix(*bix, *level); + return; +} +#undef ARCH_SHIFT +#undef INDIRECT_BIT +#undef LEVEL_SHIFT + +/* + * Time is stored as nanoseconds since the epoch. + */ +static struct timespec be64_to_timespec(__be64 betime) +{ + return ns_to_timespec(be64_to_cpu(betime)); +} + +static __be64 timespec_to_be64(struct timespec tsp) +{ + return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec); +} + +static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + inode->i_mode = be16_to_cpu(di->di_mode); + li->li_height = di->di_height; + li->li_flags = be32_to_cpu(di->di_flags); + inode->i_uid = be32_to_cpu(di->di_uid); + inode->i_gid = be32_to_cpu(di->di_gid); + inode->i_size = be64_to_cpu(di->di_size); + logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes)); + inode->i_atime = be64_to_timespec(di->di_atime); + inode->i_ctime = be64_to_timespec(di->di_ctime); + inode->i_mtime = be64_to_timespec(di->di_mtime); + inode->i_nlink = be32_to_cpu(di->di_refcount); + inode->i_generation = be32_to_cpu(di->di_generation); + + switch (inode->i_mode & S_IFMT) { + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + inode->i_rdev = be64_to_cpu(di->di_data[0]); + break; + case S_IFDIR: /* fall through */ + case S_IFREG: /* fall through */ + case S_IFLNK: + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = be64_to_cpu(di->di_data[i]); + break; + default: + BUG(); + } +} + +static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + di->di_mode = cpu_to_be16(inode->i_mode); + di->di_height = li->li_height; + di->di_pad = 0; + di->di_flags = cpu_to_be32(li->li_flags); + di->di_uid = cpu_to_be32(inode->i_uid); + di->di_gid = cpu_to_be32(inode->i_gid); + di->di_size = cpu_to_be64(i_size_read(inode)); + di->di_used_bytes = cpu_to_be64(li->li_used_bytes); + di->di_atime = timespec_to_be64(inode->i_atime); + di->di_ctime = timespec_to_be64(inode->i_ctime); + di->di_mtime = timespec_to_be64(inode->i_mtime); + di->di_refcount = cpu_to_be32(inode->i_nlink); + di->di_generation = cpu_to_be32(inode->i_generation); + + switch (inode->i_mode & S_IFMT) { + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + di->di_data[0] = cpu_to_be64(inode->i_rdev); + break; + case S_IFDIR: /* fall through */ + case S_IFREG: /* fall through */ + case S_IFLNK: + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + di->di_data[i] = cpu_to_be64(li->li_data[i]); + break; + default: + BUG(); + } +} + +static void __logfs_set_blocks(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_inode *li = logfs_inode(inode); + + inode->i_blocks = ULONG_MAX; + if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX) + inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9; +} + +void logfs_set_blocks(struct inode *inode, u64 bytes) +{ + struct logfs_inode *li = logfs_inode(inode); + + li->li_used_bytes = bytes; + __logfs_set_blocks(inode); +} + +static void prelock_page(struct super_block *sb, struct page *page, int lock) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!PageLocked(page)); + if (lock) { + BUG_ON(PagePreLocked(page)); + SetPagePreLocked(page); + } else { + /* We are in GC path. */ + if (PagePreLocked(page)) + super->s_lock_count++; + else + SetPagePreLocked(page); + } +} + +static void preunlock_page(struct super_block *sb, struct page *page, int lock) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!PageLocked(page)); + if (lock) + ClearPagePreLocked(page); + else { + /* We are in GC path. */ + BUG_ON(!PagePreLocked(page)); + if (super->s_lock_count) + super->s_lock_count--; + else + ClearPagePreLocked(page); + } +} + +/* + * Logfs is prone to an AB-BA deadlock where one task tries to acquire + * s_write_mutex with a locked page and GC tries to get that page while holding + * s_write_mutex. + * To solve this issue logfs will ignore the page lock iff the page in question + * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked + * in addition to PG_locked. + */ +static void logfs_get_wblocks(struct super_block *sb, struct page *page, + int lock) +{ + struct logfs_super *super = logfs_super(sb); + + if (page) + prelock_page(sb, page, lock); + + if (lock) { + mutex_lock(&super->s_write_mutex); + logfs_gc_pass(sb); + /* FIXME: We also have to check for shadowed space + * and mempool fill grade */ + } +} + +static void logfs_put_wblocks(struct super_block *sb, struct page *page, + int lock) +{ + struct logfs_super *super = logfs_super(sb); + + if (page) + preunlock_page(sb, page, lock); + /* Order matters - we must clear PG_pre_locked before releasing + * s_write_mutex or we could race against another task. */ + if (lock) + mutex_unlock(&super->s_write_mutex); +} + +static struct page *logfs_get_read_page(struct inode *inode, u64 bix, + level_t level) +{ + return find_or_create_page(inode->i_mapping, + logfs_pack_index(bix, level), GFP_NOFS); +} + +static void logfs_put_read_page(struct page *page) +{ + unlock_page(page); + page_cache_release(page); +} + +static void logfs_lock_write_page(struct page *page) +{ + int loop = 0; + + while (unlikely(!trylock_page(page))) { + if (loop++ > 0x1000) { + /* Has been observed once so far... */ + printk(KERN_ERR "stack at %p\n", &loop); + BUG(); + } + if (PagePreLocked(page)) { + /* Holder of page lock is waiting for us, it + * is safe to use this page. */ + break; + } + /* Some other process has this page locked and has + * nothing to do with us. Wait for it to finish. + */ + schedule(); + } + BUG_ON(!PageLocked(page)); +} + +static struct page *logfs_get_write_page(struct inode *inode, u64 bix, + level_t level) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t index = logfs_pack_index(bix, level); + struct page *page; + int err; + +repeat: + page = find_get_page(mapping, index); + if (!page) { + page = __page_cache_alloc(GFP_NOFS); + if (!page) + return NULL; + err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS); + if (unlikely(err)) { + page_cache_release(page); + if (err == -EEXIST) + goto repeat; + return NULL; + } + } else logfs_lock_write_page(page); + BUG_ON(!PageLocked(page)); + return page; +} + +static void logfs_unlock_write_page(struct page *page) +{ + if (!PagePreLocked(page)) + unlock_page(page); +} + +static void logfs_put_write_page(struct page *page) +{ + logfs_unlock_write_page(page); + page_cache_release(page); +} + +static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level, + int rw) +{ + if (rw == READ) + return logfs_get_read_page(inode, bix, level); + else + return logfs_get_write_page(inode, bix, level); +} + +static void logfs_put_page(struct page *page, int rw) +{ + if (rw == READ) + logfs_put_read_page(page); + else + logfs_put_write_page(page); +} + +static unsigned long __get_bits(u64 val, int skip, int no) +{ + u64 ret = val; + + ret >>= skip * no; + ret <<= 64 - no; + ret >>= 64 - no; + return ret; +} + +static unsigned long get_bits(u64 val, level_t skip) +{ + return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS); +} + +static inline void init_shadow_tree(struct super_block *sb, + struct shadow_tree *tree) +{ + struct logfs_super *super = logfs_super(sb); + + btree_init_mempool64(&tree->new, super->s_btree_pool); + btree_init_mempool64(&tree->old, super->s_btree_pool); +} + +static void indirect_write_block(struct logfs_block *block) +{ + struct page *page; + struct inode *inode; + int ret; + + page = block->page; + inode = page->mapping->host; + logfs_lock_write_page(page); + ret = logfs_write_buf(inode, page, 0); + logfs_unlock_write_page(page); + /* + * This needs some rework. Unless you want your filesystem to run + * completely synchronously (you don't), the filesystem will always + * report writes as 'successful' before the actual work has been + * done. The actual work gets done here and this is where any errors + * will show up. And there isn't much we can do about it, really. + * + * Some attempts to fix the errors (move from bad blocks, retry io,...) + * have already been done, so anything left should be either a broken + * device or a bug somewhere in logfs itself. Being relatively new, + * the odds currently favor a bug, so for now the line below isn't + * entirely tasteles. + */ + BUG_ON(ret); +} + +static void inode_write_block(struct logfs_block *block) +{ + struct inode *inode; + int ret; + + inode = block->inode; + if (inode->i_ino == LOGFS_INO_MASTER) + logfs_write_anchor(inode); + else { + ret = __logfs_write_inode(inode, 0); + /* see indirect_write_block comment */ + BUG_ON(ret); + } +} + +static gc_level_t inode_block_level(struct logfs_block *block) +{ + BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER); + return GC_LEVEL(LOGFS_MAX_LEVELS); +} + +static gc_level_t indirect_block_level(struct logfs_block *block) +{ + struct page *page; + struct inode *inode; + u64 bix; + level_t level; + + page = block->page; + inode = page->mapping->host; + logfs_unpack_index(page->index, &bix, &level); + return expand_level(inode->i_ino, level); +} + +/* + * This silences a false, yet annoying gcc warning. I hate it when my editor + * jumps into bitops.h each time I recompile this file. + * TODO: Complain to gcc folks about this and upgrade compiler. + */ +static unsigned long fnb(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +static __be64 inode_val0(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 val; + + /* + * Explicit shifting generates good code, but must match the format + * of the structure. Add some paranoia just in case. + */ + BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0); + BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2); + BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4); + + val = (u64)inode->i_mode << 48 | + (u64)li->li_height << 40 | + (u64)li->li_flags; + return cpu_to_be64(val); +} + +static int inode_write_alias(struct super_block *sb, + struct logfs_block *block, write_alias_t *write_one_alias) +{ + struct inode *inode = block->inode; + struct logfs_inode *li = logfs_inode(inode); + unsigned long pos; + u64 ino , bix; + __be64 val; + level_t level; + int err; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS) + return 0; + + switch (pos) { + case INODE_HEIGHT_OFS: + val = inode_val0(inode); + break; + case INODE_USED_OFS: + val = cpu_to_be64(li->li_used_bytes);; + break; + case INODE_SIZE_OFS: + val = cpu_to_be64(i_size_read(inode)); + break; + case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1: + val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]); + break; + default: + BUG(); + } + + ino = LOGFS_INO_MASTER; + bix = inode->i_ino; + level = LEVEL(0); + err = write_one_alias(sb, ino, bix, level, pos, val); + if (err) + return err; + } +} + +static int indirect_write_alias(struct super_block *sb, + struct logfs_block *block, write_alias_t *write_one_alias) +{ + unsigned long pos; + struct page *page = block->page; + u64 ino , bix; + __be64 *child, val; + level_t level; + int err; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_BLOCK_FACTOR) + return 0; + + ino = page->mapping->host->i_ino; + logfs_unpack_index(page->index, &bix, &level); + child = kmap_atomic(page, KM_USER0); + val = child[pos]; + kunmap_atomic(child, KM_USER0); + err = write_one_alias(sb, ino, bix, level, pos, val); + if (err) + return err; + } +} + +int logfs_write_obj_aliases_pagecache(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + int err; + + list_for_each_entry(block, &super->s_object_alias, alias_list) { + err = block->ops->write_alias(sb, block, write_alias_journal); + if (err) + return err; + } + return 0; +} + +void __free_block(struct super_block *sb, struct logfs_block *block) +{ + BUG_ON(!list_empty(&block->item_list)); + list_del(&block->alias_list); + mempool_free(block, logfs_super(sb)->s_block_pool); +} + +static void inode_free_block(struct super_block *sb, struct logfs_block *block) +{ + struct inode *inode = block->inode; + + logfs_inode(inode)->li_block = NULL; + __free_block(sb, block); +} + +static void indirect_free_block(struct super_block *sb, + struct logfs_block *block) +{ + ClearPagePrivate(block->page); + block->page->private = 0; + __free_block(sb, block); +} + + +static struct logfs_block_ops inode_block_ops = { + .write_block = inode_write_block, + .block_level = inode_block_level, + .free_block = inode_free_block, + .write_alias = inode_write_alias, +}; + +struct logfs_block_ops indirect_block_ops = { + .write_block = indirect_write_block, + .block_level = indirect_block_level, + .free_block = indirect_free_block, + .write_alias = indirect_write_alias, +}; + +struct logfs_block *__alloc_block(struct super_block *sb, + u64 ino, u64 bix, level_t level) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + + block = mempool_alloc(super->s_block_pool, GFP_NOFS); + memset(block, 0, sizeof(*block)); + INIT_LIST_HEAD(&block->alias_list); + INIT_LIST_HEAD(&block->item_list); + block->sb = sb; + block->ino = ino; + block->bix = bix; + block->level = level; + return block; +} + +static void alloc_inode_block(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block; + + if (li->li_block) + return; + + block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0); + block->inode = inode; + li->li_block = block; + block->ops = &inode_block_ops; +} + +void initialize_block_counters(struct page *page, struct logfs_block *block, + __be64 *array, int page_is_empty) +{ + u64 ptr; + int i, start; + + block->partial = 0; + block->full = 0; + start = 0; + if (page->index < first_indirect_block()) { + /* Counters are pointless on level 0 */ + return; + } + if (page->index == first_indirect_block()) { + /* Skip unused pointers */ + start = I0_BLOCKS; + block->full = I0_BLOCKS; + } + if (!page_is_empty) { + for (i = start; i < LOGFS_BLOCK_FACTOR; i++) { + ptr = be64_to_cpu(array[i]); + if (ptr) + block->partial++; + if (ptr & LOGFS_FULLY_POPULATED) + block->full++; + } + } +} + +static void alloc_data_block(struct inode *inode, struct page *page) +{ + struct logfs_block *block; + u64 bix; + level_t level; + + if (PagePrivate(page)) + return; + + logfs_unpack_index(page->index, &bix, &level); + block = __alloc_block(inode->i_sb, inode->i_ino, bix, level); + block->page = page; + SetPagePrivate(page); + page->private = (unsigned long)block; + block->ops = &indirect_block_ops; +} + +static void alloc_indirect_block(struct inode *inode, struct page *page, + int page_is_empty) +{ + struct logfs_block *block; + __be64 *array; + + if (PagePrivate(page)) + return; + + alloc_data_block(inode, page); + + block = logfs_block(page); + array = kmap_atomic(page, KM_USER0); + initialize_block_counters(page, block, array, page_is_empty); + kunmap_atomic(array, KM_USER0); +} + +static void block_set_pointer(struct page *page, int index, u64 ptr) +{ + struct logfs_block *block = logfs_block(page); + __be64 *array; + u64 oldptr; + + BUG_ON(!block); + array = kmap_atomic(page, KM_USER0); + oldptr = be64_to_cpu(array[index]); + array[index] = cpu_to_be64(ptr); + kunmap_atomic(array, KM_USER0); + SetPageUptodate(page); + + block->full += !!(ptr & LOGFS_FULLY_POPULATED) + - !!(oldptr & LOGFS_FULLY_POPULATED); + block->partial += !!ptr - !!oldptr; +} + +static u64 block_get_pointer(struct page *page, int index) +{ + __be64 *block; + u64 ptr; + + block = kmap_atomic(page, KM_USER0); + ptr = be64_to_cpu(block[index]); + kunmap_atomic(block, KM_USER0); + return ptr; +} + +static int logfs_read_empty(struct page *page) +{ + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + return 0; +} + +static int logfs_read_direct(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + pgoff_t index = page->index; + u64 block; + + block = li->li_data[index]; + if (!block) + return logfs_read_empty(page); + + return logfs_segment_read(inode, page, block, index, 0); +} + +static int logfs_read_loop(struct inode *inode, struct page *page, + int rw_context) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bix, bofs = li->li_data[INDIRECT_INDEX]; + level_t level, target_level; + int ret; + struct page *ipage; + + logfs_unpack_index(page->index, &bix, &target_level); + if (!bofs) + return logfs_read_empty(page); + + if (bix >= maxbix(li->li_height)) + return logfs_read_empty(page); + + for (level = LEVEL(li->li_height); + (__force u8)level > (__force u8)target_level; + level = SUBLEVEL(level)){ + ipage = logfs_get_page(inode, bix, level, rw_context); + if (!ipage) + return -ENOMEM; + + ret = logfs_segment_read(inode, ipage, bofs, bix, level); + if (ret) { + logfs_put_read_page(ipage); + return ret; + } + + bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level))); + logfs_put_page(ipage, rw_context); + if (!bofs) + return logfs_read_empty(page); + } + + return logfs_segment_read(inode, page, bofs, bix, 0); +} + +static int logfs_read_block(struct inode *inode, struct page *page, + int rw_context) +{ + pgoff_t index = page->index; + + if (index < I0_BLOCKS) + return logfs_read_direct(inode, page); + return logfs_read_loop(inode, page, rw_context); +} + +static int logfs_exist_loop(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bofs = li->li_data[INDIRECT_INDEX]; + level_t level; + int ret; + struct page *ipage; + + if (!bofs) + return 0; + if (bix >= maxbix(li->li_height)) + return 0; + + for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) { + ipage = logfs_get_read_page(inode, bix, level); + if (!ipage) + return -ENOMEM; + + ret = logfs_segment_read(inode, ipage, bofs, bix, level); + if (ret) { + logfs_put_read_page(ipage); + return ret; + } + + bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level))); + logfs_put_read_page(ipage); + if (!bofs) + return 0; + } + + return 1; +} + +int logfs_exist_block(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) + return !!li->li_data[bix]; + return logfs_exist_loop(inode, bix); +} + +static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data) +{ + struct logfs_inode *li = logfs_inode(inode); + + for (; bix < I0_BLOCKS; bix++) + if (data ^ (li->li_data[bix] == 0)) + return bix; + return I0_BLOCKS; +} + +static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data) +{ + struct logfs_inode *li = logfs_inode(inode); + __be64 *rblock; + u64 increment, bofs = li->li_data[INDIRECT_INDEX]; + level_t level; + int ret, slot; + struct page *page; + + BUG_ON(!bofs); + + for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) { + increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1)); + page = logfs_get_read_page(inode, bix, level); + if (!page) + return bix; + + ret = logfs_segment_read(inode, page, bofs, bix, level); + if (ret) { + logfs_put_read_page(page); + return bix; + } + + slot = get_bits(bix, SUBLEVEL(level)); + rblock = kmap_atomic(page, KM_USER0); + while (slot < LOGFS_BLOCK_FACTOR) { + if (data && (rblock[slot] != 0)) + break; + if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED)) + break; + slot++; + bix += increment; + bix &= ~(increment - 1); + } + if (slot >= LOGFS_BLOCK_FACTOR) { + kunmap_atomic(rblock, KM_USER0); + logfs_put_read_page(page); + return bix; + } + bofs = be64_to_cpu(rblock[slot]); + kunmap_atomic(rblock, KM_USER0); + logfs_put_read_page(page); + if (!bofs) { + BUG_ON(data); + return bix; + } + } + return bix; +} + +/** + * logfs_seek_hole - find next hole starting at a given block index + * @inode: inode to search in + * @bix: block index to start searching + * + * Returns next hole. If the file doesn't contain any further holes, the + * block address next to eof is returned instead. + */ +u64 logfs_seek_hole(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) { + bix = seek_holedata_direct(inode, bix, 0); + if (bix < I0_BLOCKS) + return bix; + } + + if (!li->li_data[INDIRECT_INDEX]) + return bix; + else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED) + bix = maxbix(li->li_height); + else { + bix = seek_holedata_loop(inode, bix, 0); + if (bix < maxbix(li->li_height)) + return bix; + /* Should not happen anymore. But if some port writes semi- + * corrupt images (as this one used to) we might run into it. + */ + WARN_ON_ONCE(bix == maxbix(li->li_height)); + } + + return bix; +} + +static u64 __logfs_seek_data(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) { + bix = seek_holedata_direct(inode, bix, 1); + if (bix < I0_BLOCKS) + return bix; + } + + if (bix < maxbix(li->li_height)) { + if (!li->li_data[INDIRECT_INDEX]) + bix = maxbix(li->li_height); + else + return seek_holedata_loop(inode, bix, 1); + } + + return bix; +} + +/** + * logfs_seek_data - find next data block after a given block index + * @inode: inode to search in + * @bix: block index to start searching + * + * Returns next data block. If the file doesn't contain any further data + * blocks, the last block in the file is returned instead. + */ +u64 logfs_seek_data(struct inode *inode, u64 bix) +{ + struct super_block *sb = inode->i_sb; + u64 ret, end; + + ret = __logfs_seek_data(inode, bix); + end = i_size_read(inode) >> sb->s_blocksize_bits; + if (ret >= end) + ret = max(bix, end); + return ret; +} + +static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs) +{ + return pure_ofs(li->li_data[bix]) == ofs; +} + +static int __logfs_is_valid_loop(struct inode *inode, u64 bix, + u64 ofs, u64 bofs) +{ + struct logfs_inode *li = logfs_inode(inode); + level_t level; + int ret; + struct page *page; + + for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)){ + page = logfs_get_write_page(inode, bix, level); + BUG_ON(!page); + + ret = logfs_segment_read(inode, page, bofs, bix, level); + if (ret) { + logfs_put_write_page(page); + return 0; + } + + bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level))); + logfs_put_write_page(page); + if (!bofs) + return 0; + + if (pure_ofs(bofs) == ofs) + return 1; + } + return 0; +} + +static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bofs = li->li_data[INDIRECT_INDEX]; + + if (!bofs) + return 0; + + if (bix >= maxbix(li->li_height)) + return 0; + + if (pure_ofs(bofs) == ofs) + return 1; + + return __logfs_is_valid_loop(inode, bix, ofs, bofs); +} + +static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs) +{ + struct logfs_inode *li = logfs_inode(inode); + + if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1) + return 0; + + if (bix < I0_BLOCKS) + return logfs_is_valid_direct(li, bix, ofs); + return logfs_is_valid_loop(inode, bix, ofs); +} + +/** + * logfs_is_valid_block - check whether this block is still valid + * + * @sb - superblock + * @ofs - block physical offset + * @ino - block inode number + * @bix - block index + * @level - block level + * + * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will + * become invalid once the journal is written. + */ +int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix, + gc_level_t gc_level) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + int ret, cookie; + + /* Umount closes a segment with free blocks remaining. Those + * blocks are by definition invalid. */ + if (ino == -1) + return 0; + + LOGFS_BUG_ON((u64)(u_long)ino != ino, sb); + + inode = logfs_safe_iget(sb, ino, &cookie); + if (IS_ERR(inode)) + goto invalid; + + ret = __logfs_is_valid_block(inode, bix, ofs); + logfs_safe_iput(inode, cookie); + if (ret) + return ret; + +invalid: + /* Block is nominally invalid, but may still sit in the shadow tree, + * waiting for a journal commit. + */ + if (btree_lookup64(&super->s_shadow_tree.old, ofs)) + return 2; + return 0; +} + +int logfs_readpage_nolock(struct page *page) +{ + struct inode *inode = page->mapping->host; + int ret = -EIO; + + ret = logfs_read_block(inode, page, READ); + + if (ret) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + flush_dcache_page(page); + + return ret; +} + +static int logfs_reserve_bytes(struct inode *inode, int bytes) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + u64 available = super->s_free_bytes + super->s_dirty_free_bytes + - super->s_dirty_used_bytes - super->s_dirty_pages; + + if (!bytes) + return 0; + + if (available < bytes) + return -ENOSPC; + + if (available < bytes + super->s_root_reserve && + !capable(CAP_SYS_RESOURCE)) + return -ENOSPC; + + return 0; +} + +int get_page_reserve(struct inode *inode, struct page *page) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + int ret; + + if (logfs_block(page) && logfs_block(page)->reserved_bytes) + return 0; + + logfs_get_wblocks(inode->i_sb, page, WF_LOCK); + ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE); + if (!ret) { + alloc_data_block(inode, page); + logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE; + super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE; + } + logfs_put_wblocks(inode->i_sb, page, WF_LOCK); + return ret; +} + +/* + * We are protected by write lock. Push victims up to superblock level + * and release transaction when appropriate. + */ +/* FIXME: This is currently called from the wrong spots. */ +static void logfs_handle_transaction(struct inode *inode, + struct logfs_transaction *ta) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + + if (!ta) + return; + logfs_inode(inode)->li_block->ta = NULL; + + if (inode->i_ino != LOGFS_INO_MASTER) { + BUG(); /* FIXME: Yes, this needs more thought */ + /* just remember the transaction until inode is written */ + //BUG_ON(logfs_inode(inode)->li_transaction); + //logfs_inode(inode)->li_transaction = ta; + return; + } + + switch (ta->state) { + case CREATE_1: /* fall through */ + case UNLINK_1: + BUG_ON(super->s_victim_ino); + super->s_victim_ino = ta->ino; + break; + case CREATE_2: /* fall through */ + case UNLINK_2: + BUG_ON(super->s_victim_ino != ta->ino); + super->s_victim_ino = 0; + /* transaction ends here - free it */ + kfree(ta); + break; + case CROSS_RENAME_1: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + super->s_rename_dir = ta->dir; + super->s_rename_pos = ta->pos; + break; + case CROSS_RENAME_2: + BUG_ON(super->s_rename_dir != ta->dir); + BUG_ON(super->s_rename_pos != ta->pos); + super->s_rename_dir = 0; + super->s_rename_pos = 0; + kfree(ta); + break; + case TARGET_RENAME_1: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + BUG_ON(super->s_victim_ino); + super->s_rename_dir = ta->dir; + super->s_rename_pos = ta->pos; + super->s_victim_ino = ta->ino; + break; + case TARGET_RENAME_2: + BUG_ON(super->s_rename_dir != ta->dir); + BUG_ON(super->s_rename_pos != ta->pos); + BUG_ON(super->s_victim_ino != ta->ino); + super->s_rename_dir = 0; + super->s_rename_pos = 0; + break; + case TARGET_RENAME_3: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + BUG_ON(super->s_victim_ino != ta->ino); + super->s_victim_ino = 0; + kfree(ta); + break; + default: + BUG(); + } +} + +/* + * Not strictly a reservation, but rather a check that we still have enough + * space to satisfy the write. + */ +static int logfs_reserve_blocks(struct inode *inode, int blocks) +{ + return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE); +} + +struct write_control { + u64 ofs; + long flags; +}; + +static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix, + level_t level, u64 old_ofs) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_shadow *shadow; + + shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS); + memset(shadow, 0, sizeof(*shadow)); + shadow->ino = inode->i_ino; + shadow->bix = bix; + shadow->gc_level = expand_level(inode->i_ino, level); + shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED; + return shadow; +} + +static void free_shadow(struct inode *inode, struct logfs_shadow *shadow) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + + mempool_free(shadow, super->s_shadow_pool); +} + +/** + * fill_shadow_tree - Propagate shadow tree changes due to a write + * @inode: Inode owning the page + * @page: Struct page that was written + * @shadow: Shadow for the current write + * + * Writes in logfs can result in two semi-valid objects. The old object + * is still valid as long as it can be reached by following pointers on + * the medium. Only when writes propagate all the way up to the journal + * has the new object safely replaced the old one. + * + * To handle this problem, a struct logfs_shadow is used to represent + * every single write. It is attached to the indirect block, which is + * marked dirty. When the indirect block is written, its shadows are + * handed up to the next indirect block (or inode). Untimately they + * will reach the master inode and be freed upon journal commit. + * + * This function handles a single step in the propagation. It adds the + * shadow for the current write to the tree, along with any shadows in + * the page's tree, in case it was an indirect block. If a page is + * written, the inode parameter is left NULL, if an inode is written, + * the page parameter is left NULL. + */ +static void fill_shadow_tree(struct inode *inode, struct page *page, + struct logfs_shadow *shadow) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_block *block = logfs_block(page); + struct shadow_tree *tree = &super->s_shadow_tree; + + if (PagePrivate(page)) { + if (block->alias_map) + super->s_no_object_aliases -= bitmap_weight( + block->alias_map, LOGFS_BLOCK_FACTOR); + logfs_handle_transaction(inode, block->ta); + block->ops->free_block(inode->i_sb, block); + } + if (shadow) { + if (shadow->old_ofs) + btree_insert64(&tree->old, shadow->old_ofs, shadow, + GFP_NOFS); + else + btree_insert64(&tree->new, shadow->new_ofs, shadow, + GFP_NOFS); + + super->s_dirty_used_bytes += shadow->new_len; + super->s_dirty_free_bytes += shadow->old_len; + } +} + +static void logfs_set_alias(struct super_block *sb, struct logfs_block *block, + long child_no) +{ + struct logfs_super *super = logfs_super(sb); + + if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) { + /* Aliases in the master inode are pointless. */ + return; + } + + if (!test_bit(child_no, block->alias_map)) { + set_bit(child_no, block->alias_map); + super->s_no_object_aliases++; + } + list_move_tail(&block->alias_list, &super->s_object_alias); +} + +/* + * Object aliases can and often do change the size and occupied space of a + * file. So not only do we have to change the pointers, we also have to + * change inode->i_size and li->li_used_bytes. Which is done by setting + * another two object aliases for the inode itself. + */ +static void set_iused(struct inode *inode, struct logfs_shadow *shadow) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (shadow->new_len == shadow->old_len) + return; + + alloc_inode_block(inode); + li->li_used_bytes += shadow->new_len - shadow->old_len; + __logfs_set_blocks(inode); + logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS); + logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS); +} + +static int logfs_write_i0(struct inode *inode, struct page *page, + struct write_control *wc) +{ + struct logfs_shadow *shadow; + u64 bix; + level_t level; + int full, err = 0; + + logfs_unpack_index(page->index, &bix, &level); + if (wc->ofs == 0) + if (logfs_reserve_blocks(inode, 1)) + return -ENOSPC; + + shadow = alloc_shadow(inode, bix, level, wc->ofs); + if (wc->flags & WF_WRITE) + err = logfs_segment_write(inode, page, shadow); + if (wc->flags & WF_DELETE) + logfs_segment_delete(inode, shadow); + if (err) { + free_shadow(inode, shadow); + return err; + } + + set_iused(inode, shadow); + full = 1; + if (level != 0) { + alloc_indirect_block(inode, page, 0); + full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR; + } + fill_shadow_tree(inode, page, shadow); + wc->ofs = shadow->new_ofs; + if (wc->ofs && full) + wc->ofs |= LOGFS_FULLY_POPULATED; + return 0; +} + +static int logfs_write_direct(struct inode *inode, struct page *page, + long flags) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[page->index], + .flags = flags, + }; + int err; + + alloc_inode_block(inode); + + err = logfs_write_i0(inode, page, &wc); + if (err) + return err; + + li->li_data[page->index] = wc.ofs; + logfs_set_alias(inode->i_sb, li->li_block, + page->index + INODE_POINTER_OFS); + return 0; +} + +static int ptr_change(u64 ofs, struct page *page) +{ + struct logfs_block *block = logfs_block(page); + int empty0, empty1, full0, full1; + + empty0 = ofs == 0; + empty1 = block->partial == 0; + if (empty0 != empty1) + return 1; + + /* The !! is necessary to shrink result to int */ + full0 = !!(ofs & LOGFS_FULLY_POPULATED); + full1 = block->full == LOGFS_BLOCK_FACTOR; + if (full0 != full1) + return 1; + return 0; +} + +static int __logfs_write_rec(struct inode *inode, struct page *page, + struct write_control *this_wc, + pgoff_t bix, level_t target_level, level_t level) +{ + int ret, page_empty = 0; + int child_no = get_bits(bix, SUBLEVEL(level)); + struct page *ipage; + struct write_control child_wc = { + .flags = this_wc->flags, + }; + + ipage = logfs_get_write_page(inode, bix, level); + if (!ipage) + return -ENOMEM; + + if (this_wc->ofs) { + ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); + if (ret) + goto out; + } else if (!PageUptodate(ipage)) { + page_empty = 1; + logfs_read_empty(ipage); + } + + child_wc.ofs = block_get_pointer(ipage, child_no); + + if ((__force u8)level-1 > (__force u8)target_level) + ret = __logfs_write_rec(inode, page, &child_wc, bix, + target_level, SUBLEVEL(level)); + else + ret = logfs_write_i0(inode, page, &child_wc); + + if (ret) + goto out; + + alloc_indirect_block(inode, ipage, page_empty); + block_set_pointer(ipage, child_no, child_wc.ofs); + /* FIXME: first condition seems superfluous */ + if (child_wc.ofs || logfs_block(ipage)->partial) + this_wc->flags |= WF_WRITE; + /* the condition on this_wc->ofs ensures that we won't consume extra + * space for indirect blocks in the future, which we cannot reserve */ + if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage)) + ret = logfs_write_i0(inode, ipage, this_wc); + else + logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no); +out: + logfs_put_write_page(ipage); + return ret; +} + +static int logfs_write_rec(struct inode *inode, struct page *page, + pgoff_t bix, level_t target_level, long flags) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[INDIRECT_INDEX], + .flags = flags, + }; + int ret; + + alloc_inode_block(inode); + + if (li->li_height > (__force u8)target_level) + ret = __logfs_write_rec(inode, page, &wc, bix, target_level, + LEVEL(li->li_height)); + else + ret = logfs_write_i0(inode, page, &wc); + if (ret) + return ret; + + if (li->li_data[INDIRECT_INDEX] != wc.ofs) { + li->li_data[INDIRECT_INDEX] = wc.ofs; + logfs_set_alias(inode->i_sb, li->li_block, + INDIRECT_INDEX + INODE_POINTER_OFS); + } + return ret; +} + +void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + alloc_inode_block(inode); + logfs_inode(inode)->li_block->ta = ta; +} + +void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + struct logfs_block *block = logfs_inode(inode)->li_block; + + if (block && block->ta) + block->ta = NULL; +} + +static int grow_inode(struct inode *inode, u64 bix, level_t level) +{ + struct logfs_inode *li = logfs_inode(inode); + u8 height = (__force u8)level; + struct page *page; + struct write_control wc = { + .flags = WF_WRITE, + }; + int err; + + BUG_ON(height > 5 || li->li_height > 5); + while (height > li->li_height || bix >= maxbix(li->li_height)) { + page = logfs_get_write_page(inode, I0_BLOCKS + 1, + LEVEL(li->li_height + 1)); + if (!page) + return -ENOMEM; + logfs_read_empty(page); + alloc_indirect_block(inode, page, 1); + block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]); + err = logfs_write_i0(inode, page, &wc); + logfs_put_write_page(page); + if (err) + return err; + li->li_data[INDIRECT_INDEX] = wc.ofs; + wc.ofs = 0; + li->li_height++; + logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS); + } + return 0; +} + +static int __logfs_write_buf(struct inode *inode, struct page *page, long flags) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + pgoff_t index = page->index; + u64 bix; + level_t level; + int err; + + flags |= WF_WRITE | WF_DELETE; + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + logfs_unpack_index(index, &bix, &level); + if (logfs_block(page) && logfs_block(page)->reserved_bytes) + super->s_dirty_pages -= logfs_block(page)->reserved_bytes; + + if (index < I0_BLOCKS) + return logfs_write_direct(inode, page, flags); + + bix = adjust_bix(bix, level); + err = grow_inode(inode, bix, level); + if (err) + return err; + return logfs_write_rec(inode, page, bix, level, flags); +} + +int logfs_write_buf(struct inode *inode, struct page *page, long flags) +{ + struct super_block *sb = inode->i_sb; + int ret; + + logfs_get_wblocks(sb, page, flags & WF_LOCK); + ret = __logfs_write_buf(inode, page, flags); + logfs_put_wblocks(sb, page, flags & WF_LOCK); + return ret; +} + +static int __logfs_delete(struct inode *inode, struct page *page) +{ + long flags = WF_DELETE; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + if (page->index < I0_BLOCKS) + return logfs_write_direct(inode, page, flags); + return logfs_write_rec(inode, page, page->index, 0, flags); +} + +int logfs_delete(struct inode *inode, pgoff_t index, + struct shadow_tree *shadow_tree) +{ + struct super_block *sb = inode->i_sb; + struct page *page; + int ret; + + page = logfs_get_read_page(inode, index, 0); + if (!page) + return -ENOMEM; + + logfs_get_wblocks(sb, page, 1); + ret = __logfs_delete(inode, page); + logfs_put_wblocks(sb, page, 1); + + logfs_put_read_page(page); + + return ret; +} + +/* Rewrite cannot mark the inode dirty but has to write it immediatly. */ +int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, + gc_level_t gc_level, long flags) +{ + level_t level = shrink_level(gc_level); + struct page *page; + int err; + + page = logfs_get_write_page(inode, bix, level); + if (!page) + return -ENOMEM; + + err = logfs_segment_read(inode, page, ofs, bix, level); + if (!err) { + if (level != 0) + alloc_indirect_block(inode, page, 0); + err = logfs_write_buf(inode, page, flags); + } + logfs_put_write_page(page); + return err; +} + +static int truncate_data_block(struct inode *inode, struct page *page, + u64 ofs, struct logfs_shadow *shadow, u64 size) +{ + loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits; + u64 bix; + level_t level; + int err; + + /* Does truncation happen within this page? */ + if (size <= pageofs || size - pageofs >= PAGE_SIZE) + return 0; + + logfs_unpack_index(page->index, &bix, &level); + BUG_ON(level != 0); + + err = logfs_segment_read(inode, page, ofs, bix, level); + if (err) + return err; + + zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE); + return logfs_segment_write(inode, page, shadow); +} + +static int logfs_truncate_i0(struct inode *inode, struct page *page, + struct write_control *wc, u64 size) +{ + struct logfs_shadow *shadow; + u64 bix; + level_t level; + int err = 0; + + logfs_unpack_index(page->index, &bix, &level); + BUG_ON(level != 0); + shadow = alloc_shadow(inode, bix, level, wc->ofs); + + err = truncate_data_block(inode, page, wc->ofs, shadow, size); + if (err) { + free_shadow(inode, shadow); + return err; + } + + logfs_segment_delete(inode, shadow); + set_iused(inode, shadow); + fill_shadow_tree(inode, page, shadow); + wc->ofs = shadow->new_ofs; + return 0; +} + +static int logfs_truncate_direct(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc; + struct page *page; + int e; + int err; + + alloc_inode_block(inode); + + for (e = I0_BLOCKS - 1; e >= 0; e--) { + if (size > (e+1) * LOGFS_BLOCKSIZE) + break; + + wc.ofs = li->li_data[e]; + if (!wc.ofs) + continue; + + page = logfs_get_write_page(inode, e, 0); + if (!page) + return -ENOMEM; + err = logfs_segment_read(inode, page, wc.ofs, e, 0); + if (err) { + logfs_put_write_page(page); + return err; + } + err = logfs_truncate_i0(inode, page, &wc, size); + logfs_put_write_page(page); + if (err) + return err; + + li->li_data[e] = wc.ofs; + } + return 0; +} + +/* FIXME: these need to become per-sb once we support different blocksizes */ +static u64 __logfs_step[] = { + 1, + I1_BLOCKS, + I2_BLOCKS, + I3_BLOCKS, +}; + +static u64 __logfs_start_index[] = { + I0_BLOCKS, + I1_BLOCKS, + I2_BLOCKS, + I3_BLOCKS +}; + +static inline u64 logfs_step(level_t level) +{ + return __logfs_step[(__force u8)level]; +} + +static inline u64 logfs_factor(u8 level) +{ + return __logfs_step[level] * LOGFS_BLOCKSIZE; +} + +static inline u64 logfs_start_index(level_t level) +{ + return __logfs_start_index[(__force u8)level]; +} + +static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level) +{ + logfs_unpack_index(index, bix, level); + if (*bix <= logfs_start_index(SUBLEVEL(*level))) + *bix = 0; +} + +static int __logfs_truncate_rec(struct inode *inode, struct page *ipage, + struct write_control *this_wc, u64 size) +{ + int truncate_happened = 0; + int e, err = 0; + u64 bix, child_bix, next_bix; + level_t level; + struct page *page; + struct write_control child_wc = { /* FIXME: flags */ }; + + logfs_unpack_raw_index(ipage->index, &bix, &level); + err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); + if (err) + return err; + + for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) { + child_bix = bix + e * logfs_step(SUBLEVEL(level)); + next_bix = child_bix + logfs_step(SUBLEVEL(level)); + if (size > next_bix * LOGFS_BLOCKSIZE) + break; + + child_wc.ofs = pure_ofs(block_get_pointer(ipage, e)); + if (!child_wc.ofs) + continue; + + page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level)); + if (!page) + return -ENOMEM; + + if ((__force u8)level > 1) + err = __logfs_truncate_rec(inode, page, &child_wc, size); + else + err = logfs_truncate_i0(inode, page, &child_wc, size); + logfs_put_write_page(page); + if (err) + return err; + + truncate_happened = 1; + alloc_indirect_block(inode, ipage, 0); + block_set_pointer(ipage, e, child_wc.ofs); + } + + if (!truncate_happened) { + printk("ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size); + return 0; + } + + this_wc->flags = WF_DELETE; + if (logfs_block(ipage)->partial) + this_wc->flags |= WF_WRITE; + + return logfs_write_i0(inode, ipage, this_wc); +} + +static int logfs_truncate_rec(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[INDIRECT_INDEX], + }; + struct page *page; + int err; + + alloc_inode_block(inode); + + if (!wc.ofs) + return 0; + + page = logfs_get_write_page(inode, 0, LEVEL(li->li_height)); + if (!page) + return -ENOMEM; + + err = __logfs_truncate_rec(inode, page, &wc, size); + logfs_put_write_page(page); + if (err) + return err; + + if (li->li_data[INDIRECT_INDEX] != wc.ofs) + li->li_data[INDIRECT_INDEX] = wc.ofs; + return 0; +} + +static int __logfs_truncate(struct inode *inode, u64 size) +{ + int ret; + + if (size >= logfs_factor(logfs_inode(inode)->li_height)) + return 0; + + ret = logfs_truncate_rec(inode, size); + if (ret) + return ret; + + return logfs_truncate_direct(inode, size); +} + +int logfs_truncate(struct inode *inode, u64 size) +{ + struct super_block *sb = inode->i_sb; + int err; + + logfs_get_wblocks(sb, NULL, 1); + err = __logfs_truncate(inode, size); + if (!err) + err = __logfs_write_inode(inode, 0); + logfs_put_wblocks(sb, NULL, 1); + + if (!err) + err = vmtruncate(inode, size); + + /* I don't trust error recovery yet. */ + WARN_ON(err); + return err; +} + +static void move_page_to_inode(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = logfs_block(page); + + if (!block) + return; + + log_blockmove("move_page_to_inode(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + BUG_ON(li->li_block); + block->ops = &inode_block_ops; + block->inode = inode; + li->li_block = block; + + block->page = NULL; + page->private = 0; + ClearPagePrivate(page); +} + +static void move_inode_to_page(struct page *page, struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = li->li_block; + + if (!block) + return; + + log_blockmove("move_inode_to_page(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + BUG_ON(PagePrivate(page)); + block->ops = &indirect_block_ops; + block->page = page; + page->private = (unsigned long)block; + SetPagePrivate(page); + + block->inode = NULL; + li->li_block = NULL; +} + +int logfs_read_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct inode *master_inode = super->s_master_inode; + struct page *page; + struct logfs_disk_inode *di; + u64 ino = inode->i_ino; + + if (ino << sb->s_blocksize_bits > i_size_read(master_inode)) + return -ENODATA; + if (!logfs_exist_block(master_inode, ino)) + return -ENODATA; + + page = read_cache_page(master_inode->i_mapping, ino, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + di = kmap_atomic(page, KM_USER0); + logfs_disk_to_inode(di, inode); + kunmap_atomic(di, KM_USER0); + move_page_to_inode(inode, page); + page_cache_release(page); + return 0; +} + +/* Caller must logfs_put_write_page(page); */ +static struct page *inode_to_page(struct inode *inode) +{ + struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode; + struct logfs_disk_inode *di; + struct page *page; + + BUG_ON(inode->i_ino == LOGFS_INO_MASTER); + + page = logfs_get_write_page(master_inode, inode->i_ino, 0); + if (!page) + return NULL; + + di = kmap_atomic(page, KM_USER0); + logfs_inode_to_disk(inode, di); + kunmap_atomic(di, KM_USER0); + move_inode_to_page(page, inode); + return page; +} + +/* Cheaper version of write_inode. All changes are concealed in + * aliases, which are moved back. No write to the medium happens. + */ +void logfs_clear_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = li->li_block; + struct page *page; + + /* Only deleted files may be dirty at this point */ + BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink); + if (!block) + return; + if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) { + block->ops->free_block(inode->i_sb, block); + return; + } + + BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS); + page = inode_to_page(inode); + BUG_ON(!page); /* FIXME: Use emergency page */ + logfs_put_write_page(page); +} + +static int do_write_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct inode *master_inode = logfs_super(sb)->s_master_inode; + loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits; + struct page *page; + int err; + + BUG_ON(inode->i_ino == LOGFS_INO_MASTER); + /* FIXME: lock inode */ + + if (i_size_read(master_inode) < size) + i_size_write(master_inode, size); + + /* TODO: Tell vfs this inode is clean now */ + + page = inode_to_page(inode); + if (!page) + return -ENOMEM; + + /* FIXME: transaction is part of logfs_block now. Is that enough? */ + err = logfs_write_buf(master_inode, page, 0); + logfs_put_write_page(page); + return err; +} + +static void logfs_mod_segment_entry(struct super_block *sb, u32 segno, + int write, + void (*change_se)(struct logfs_segment_entry *, long), + long arg) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + struct page *page; + struct logfs_segment_entry *se; + pgoff_t page_no; + int child_no; + + page_no = segno >> (sb->s_blocksize_bits - 3); + child_no = segno & ((sb->s_blocksize >> 3) - 1); + + inode = super->s_segfile_inode; + page = logfs_get_write_page(inode, page_no, 0); + BUG_ON(!page); /* FIXME: We need some reserve page for this case */ + if (!PageUptodate(page)) + logfs_read_block(inode, page, WRITE); + + if (write) + alloc_indirect_block(inode, page, 0); + se = kmap_atomic(page, KM_USER0); + change_se(se + child_no, arg); + if (write) { + logfs_set_alias(sb, logfs_block(page), child_no); + BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize); + } + kunmap_atomic(se, KM_USER0); + + logfs_put_write_page(page); +} + +static void __get_segment_entry(struct logfs_segment_entry *se, long _target) +{ + struct logfs_segment_entry *target = (void *)_target; + + *target = *se; +} + +void logfs_get_segment_entry(struct super_block *sb, u32 segno, + struct logfs_segment_entry *se) +{ + logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se); +} + +static void __set_segment_used(struct logfs_segment_entry *se, long increment) +{ + u32 valid; + + valid = be32_to_cpu(se->valid); + valid += increment; + se->valid = cpu_to_be32(valid); +} + +void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment) +{ + struct logfs_super *super = logfs_super(sb); + u32 segno = ofs >> super->s_segshift; + + if (!increment) + return; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment); +} + +static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level) +{ + se->ec_level = cpu_to_be32(ec_level); +} + +void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec, + gc_level_t gc_level) +{ + u32 ec_level = ec << 4 | (__force u8)gc_level; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level); +} + +static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore) +{ + se->valid = cpu_to_be32(RESERVED); +} + +void logfs_set_segment_reserved(struct super_block *sb, u32 segno) +{ + logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0); +} + +static void __set_segment_unreserved(struct logfs_segment_entry *se, + long ec_level) +{ + se->valid = 0; + se->ec_level = cpu_to_be32(ec_level); +} + +void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec) +{ + u32 ec_level = ec << 4; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved, + ec_level); +} + +int __logfs_write_inode(struct inode *inode, long flags) +{ + struct super_block *sb = inode->i_sb; + int ret; + + logfs_get_wblocks(sb, NULL, flags & WF_LOCK); + ret = do_write_inode(inode); + logfs_put_wblocks(sb, NULL, flags & WF_LOCK); + return ret; +} + +static int do_delete_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct inode *master_inode = logfs_super(sb)->s_master_inode; + struct page *page; + int ret; + + page = logfs_get_write_page(master_inode, inode->i_ino, 0); + if (!page) + return -ENOMEM; + + move_inode_to_page(page, inode); + + logfs_get_wblocks(sb, page, 1); + ret = __logfs_delete(master_inode, page); + logfs_put_wblocks(sb, page, 1); + + logfs_put_write_page(page); + return ret; +} + +/* + * ZOMBIE inodes have already been deleted before and should remain dead, + * if it weren't for valid checking. No need to kill them again here. + */ +void logfs_delete_inode(struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (!(li->li_flags & LOGFS_IF_ZOMBIE)) { + li->li_flags |= LOGFS_IF_ZOMBIE; + if (i_size_read(inode) > 0) + logfs_truncate(inode, 0); + do_delete_inode(inode); + } + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); +} + +void btree_write_block(struct logfs_block *block) +{ + struct inode *inode; + struct page *page; + int err, cookie; + + inode = logfs_safe_iget(block->sb, block->ino, &cookie); + page = logfs_get_write_page(inode, block->bix, block->level); + + err = logfs_readpage_nolock(page); + BUG_ON(err); + BUG_ON(!PagePrivate(page)); + BUG_ON(logfs_block(page) != block); + err = __logfs_write_buf(inode, page, 0); + BUG_ON(err); + BUG_ON(PagePrivate(page) || page->private); + + logfs_put_write_page(page); + logfs_safe_iput(inode, cookie); +} + +/** + * logfs_inode_write - write inode or dentry objects + * + * @inode: parent inode (ifile or directory) + * @buf: object to write (inode or dentry) + * @n: object size + * @_pos: object number (file position in blocks/objects) + * @flags: write flags + * @lock: 0 if write lock is already taken, 1 otherwise + * @shadow_tree: shadow below this inode + * + * FIXME: All caller of this put a 200-300 byte variable on the stack, + * only to call here and do a memcpy from that stack variable. A good + * example of wasted performance and stack space. + */ +int logfs_inode_write(struct inode *inode, const void *buf, size_t count, + loff_t bix, long flags, struct shadow_tree *shadow_tree) +{ + loff_t pos = bix << inode->i_sb->s_blocksize_bits; + int err; + struct page *page; + void *pagebuf; + + BUG_ON(pos & (LOGFS_BLOCKSIZE-1)); + BUG_ON(count > LOGFS_BLOCKSIZE); + page = logfs_get_write_page(inode, bix, 0); + if (!page) + return -ENOMEM; + + pagebuf = kmap_atomic(page, KM_USER0); + memcpy(pagebuf, buf, count); + flush_dcache_page(page); + kunmap_atomic(pagebuf, KM_USER0); + + if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE) + i_size_write(inode, pos + LOGFS_BLOCKSIZE); + + err = logfs_write_buf(inode, page, flags); + logfs_put_write_page(page); + return err; +} + +int logfs_open_segfile(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + + inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE); + if (IS_ERR(inode)) + return PTR_ERR(inode); + super->s_segfile_inode = inode; + return 0; +} + +int logfs_init_rw(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int min_fill = 3 * super->s_no_blocks; + + INIT_LIST_HEAD(&super->s_object_alias); + mutex_init(&super->s_write_mutex); + super->s_block_pool = mempool_create_kmalloc_pool(min_fill, + sizeof(struct logfs_block)); + super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill, + sizeof(struct logfs_shadow)); + return 0; +} + +void logfs_cleanup_rw(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + destroy_meta_inode(super->s_segfile_inode); + if (super->s_block_pool) + mempool_destroy(super->s_block_pool); + if (super->s_shadow_pool) + mempool_destroy(super->s_shadow_pool); +} diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c new file mode 100644 index 000000000000..5f58b74516ca --- /dev/null +++ b/fs/logfs/segment.c @@ -0,0 +1,924 @@ +/* + * fs/logfs/segment.c - Handling the Object Store + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + * + * Object store or ostore makes up the complete device with exception of + * the superblock and journal areas. Apart from its own metadata it stores + * three kinds of objects: inodes, dentries and blocks, both data and indirect. + */ +#include "logfs.h" + +static int logfs_mark_segment_bad(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct btree_head32 *head = &super->s_reserved_segments; + int err; + + err = btree_insert32(head, segno, (void *)1, GFP_NOFS); + if (err) + return err; + logfs_super(sb)->s_bad_segments++; + /* FIXME: write to journal */ + return 0; +} + +int logfs_erase_segment(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + + super->s_gec++; + + return super->s_devops->erase(sb, (u64)segno << super->s_segshift, + super->s_segsize); +} + +static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes) +{ + s32 ofs; + + logfs_open_area(area, bytes); + + ofs = area->a_used_bytes; + area->a_used_bytes += bytes; + BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize); + + return dev_ofs(area->a_sb, area->a_segno, ofs); +} + +static struct page *get_mapping_page(struct super_block *sb, pgoff_t index, + int use_filler) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = super->s_devops->readpage; + struct page *page; + + BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS); + if (use_filler) + page = read_cache_page(mapping, index, filler, sb); + else { + page = find_or_create_page(mapping, index, GFP_NOFS); + unlock_page(page); + } + return page; +} + +void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, + int use_filler) +{ + pgoff_t index = ofs >> PAGE_SHIFT; + struct page *page; + long offset = ofs & (PAGE_SIZE-1); + long copylen; + + /* Only logfs_wbuf_recover may use len==0 */ + BUG_ON(!len && !use_filler); + do { + copylen = min((ulong)len, PAGE_SIZE - offset); + + page = get_mapping_page(area->a_sb, index, use_filler); + SetPageUptodate(page); + BUG_ON(!page); /* FIXME: reserve a pool */ + memcpy(page_address(page) + offset, buf, copylen); + SetPagePrivate(page); + page_cache_release(page); + + buf += copylen; + len -= copylen; + offset = 0; + index++; + } while (len); +} + +/* + * bdev_writeseg will write full pages. Memset the tail to prevent data leaks. + */ +static void pad_wbuf(struct logfs_area *area, int final) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + struct page *page; + u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); + pgoff_t index = ofs >> PAGE_SHIFT; + long offset = ofs & (PAGE_SIZE-1); + u32 len = PAGE_SIZE - offset; + + if (len == PAGE_SIZE) { + /* The math in this function can surely use some love */ + len = 0; + } + if (len) { + BUG_ON(area->a_used_bytes >= super->s_segsize); + + page = get_mapping_page(area->a_sb, index, 0); + BUG_ON(!page); /* FIXME: reserve a pool */ + memset(page_address(page) + offset, 0xff, len); + SetPagePrivate(page); + page_cache_release(page); + } + + if (!final) + return; + + area->a_used_bytes += len; + for ( ; area->a_used_bytes < super->s_segsize; + area->a_used_bytes += PAGE_SIZE) { + /* Memset another page */ + index++; + page = get_mapping_page(area->a_sb, index, 0); + BUG_ON(!page); /* FIXME: reserve a pool */ + memset(page_address(page), 0xff, PAGE_SIZE); + SetPagePrivate(page); + page_cache_release(page); + } +} + +/* + * We have to be careful with the alias tree. Since lookup is done by bix, + * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with + * indirect blocks. So always use it through accessor functions. + */ +static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix, + level_t level) +{ + struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; + pgoff_t index = logfs_pack_index(bix, level); + + return btree_lookup128(head, ino, index); +} + +static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix, + level_t level, void *val) +{ + struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; + pgoff_t index = logfs_pack_index(bix, level); + + return btree_insert128(head, ino, index, val, GFP_NOFS); +} + +static int btree_write_alias(struct super_block *sb, struct logfs_block *block, + write_alias_t *write_one_alias) +{ + struct object_alias_item *item; + int err; + + list_for_each_entry(item, &block->item_list, list) { + err = write_alias_journal(sb, block->ino, block->bix, + block->level, item->child_no, item->val); + if (err) + return err; + } + return 0; +} + +static gc_level_t btree_block_level(struct logfs_block *block) +{ + return expand_level(block->ino, block->level); +} + +static struct logfs_block_ops btree_block_ops = { + .write_block = btree_write_block, + .block_level = btree_block_level, + .free_block = __free_block, + .write_alias = btree_write_alias, +}; + +int logfs_load_object_aliases(struct super_block *sb, + struct logfs_obj_alias *oa, int count) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + struct object_alias_item *item; + u64 ino, bix; + level_t level; + int i, err; + + super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; + count /= sizeof(*oa); + for (i = 0; i < count; i++) { + item = mempool_alloc(super->s_alias_pool, GFP_NOFS); + if (!item) + return -ENOMEM; + memset(item, 0, sizeof(*item)); + + super->s_no_object_aliases++; + item->val = oa[i].val; + item->child_no = be16_to_cpu(oa[i].child_no); + + ino = be64_to_cpu(oa[i].ino); + bix = be64_to_cpu(oa[i].bix); + level = LEVEL(oa[i].level); + + log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n", + ino, bix, level, item->child_no, + be64_to_cpu(item->val)); + block = alias_tree_lookup(sb, ino, bix, level); + if (!block) { + block = __alloc_block(sb, ino, bix, level); + block->ops = &btree_block_ops; + err = alias_tree_insert(sb, ino, bix, level, block); + BUG_ON(err); /* mempool empty */ + } + if (test_and_set_bit(item->child_no, block->alias_map)) { + printk(KERN_ERR"LogFS: Alias collision detected\n"); + return -EIO; + } + list_move_tail(&block->alias_list, &super->s_object_alias); + list_add(&item->list, &block->item_list); + } + return 0; +} + +static void kill_alias(void *_block, unsigned long ignore0, + u64 ignore1, u64 ignore2, size_t ignore3) +{ + struct logfs_block *block = _block; + struct super_block *sb = block->sb; + struct logfs_super *super = logfs_super(sb); + struct object_alias_item *item; + + while (!list_empty(&block->item_list)) { + item = list_entry(block->item_list.next, typeof(*item), list); + list_del(&item->list); + mempool_free(item, super->s_alias_pool); + } + block->ops->free_block(sb, block); +} + +static int obj_type(struct inode *inode, level_t level) +{ + if (level == 0) { + if (S_ISDIR(inode->i_mode)) + return OBJ_DENTRY; + if (inode->i_ino == LOGFS_INO_MASTER) + return OBJ_INODE; + } + return OBJ_BLOCK; +} + +static int obj_len(struct super_block *sb, int obj_type) +{ + switch (obj_type) { + case OBJ_DENTRY: + return sizeof(struct logfs_disk_dentry); + case OBJ_INODE: + return sizeof(struct logfs_disk_inode); + case OBJ_BLOCK: + return sb->s_blocksize; + default: + BUG(); + } +} + +static int __logfs_segment_write(struct inode *inode, void *buf, + struct logfs_shadow *shadow, int type, int len, int compr) +{ + struct logfs_area *area; + struct super_block *sb = inode->i_sb; + s64 ofs; + struct logfs_object_header h; + int acc_len; + + if (shadow->gc_level == 0) + acc_len = len; + else + acc_len = obj_len(sb, type); + + area = get_area(sb, shadow->gc_level); + ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE); + LOGFS_BUG_ON(ofs <= 0, sb); + /* + * Order is important. logfs_get_free_bytes(), by modifying the + * segment file, may modify the content of the very page we're about + * to write now. Which is fine, as long as the calculated crc and + * written data still match. So do the modifications _before_ + * calculating the crc. + */ + + h.len = cpu_to_be16(len); + h.type = type; + h.compr = compr; + h.ino = cpu_to_be64(inode->i_ino); + h.bix = cpu_to_be64(shadow->bix); + h.crc = logfs_crc32(&h, sizeof(h) - 4, 4); + h.data_crc = logfs_crc32(buf, len, 0); + + logfs_buf_write(area, ofs, &h, sizeof(h)); + logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len); + + shadow->new_ofs = ofs; + shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE; + + return 0; +} + +static s64 logfs_segment_write_compress(struct inode *inode, void *buf, + struct logfs_shadow *shadow, int type, int len) +{ + struct super_block *sb = inode->i_sb; + void *compressor_buf = logfs_super(sb)->s_compressed_je; + ssize_t compr_len; + int ret; + + mutex_lock(&logfs_super(sb)->s_journal_mutex); + compr_len = logfs_compress(buf, compressor_buf, len, len); + + if (compr_len >= 0) { + ret = __logfs_segment_write(inode, compressor_buf, shadow, + type, compr_len, COMPR_ZLIB); + } else { + ret = __logfs_segment_write(inode, buf, shadow, type, len, + COMPR_NONE); + } + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + return ret; +} + +/** + * logfs_segment_write - write data block to object store + * @inode: inode containing data + * + * Returns an errno or zero. + */ +int logfs_segment_write(struct inode *inode, struct page *page, + struct logfs_shadow *shadow) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + int do_compress, type, len; + int ret; + void *buf; + + BUG_ON(logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED; + if (shadow->gc_level != 0) { + /* temporarily disable compression for indirect blocks */ + do_compress = 0; + } + + type = obj_type(inode, shrink_level(shadow->gc_level)); + len = obj_len(sb, type); + buf = kmap(page); + if (do_compress) + ret = logfs_segment_write_compress(inode, buf, shadow, type, + len); + else + ret = __logfs_segment_write(inode, buf, shadow, type, len, + COMPR_NONE); + kunmap(page); + + log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + /* this BUG_ON did catch a locking bug. useful */ + BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1))); + return ret; +} + +int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf) +{ + pgoff_t index = ofs >> PAGE_SHIFT; + struct page *page; + long offset = ofs & (PAGE_SIZE-1); + long copylen; + + while (len) { + copylen = min((ulong)len, PAGE_SIZE - offset); + + page = get_mapping_page(sb, index, 1); + if (IS_ERR(page)) + return PTR_ERR(page); + memcpy(buf, page_address(page) + offset, copylen); + page_cache_release(page); + + buf += copylen; + len -= copylen; + offset = 0; + index++; + } + return 0; +} + +/* + * The "position" of indirect blocks is ambiguous. It can be the position + * of any data block somewhere behind this indirect block. So we need to + * normalize the positions through logfs_block_mask() before comparing. + */ +static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level) +{ + return (pos1 & logfs_block_mask(sb, level)) != + (pos2 & logfs_block_mask(sb, level)); +} + +#if 0 +static int read_seg_header(struct super_block *sb, u64 ofs, + struct logfs_segment_header *sh) +{ + __be32 crc; + int err; + + err = wbuf_read(sb, ofs, sizeof(*sh), sh); + if (err) + return err; + crc = logfs_crc32(sh, sizeof(*sh), 4); + if (crc != sh->crc) { + printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " + "got %x\n", ofs, be32_to_cpu(sh->crc), + be32_to_cpu(crc)); + return -EIO; + } + return 0; +} +#endif + +static int read_obj_header(struct super_block *sb, u64 ofs, + struct logfs_object_header *oh) +{ + __be32 crc; + int err; + + err = wbuf_read(sb, ofs, sizeof(*oh), oh); + if (err) + return err; + crc = logfs_crc32(oh, sizeof(*oh) - 4, 4); + if (crc != oh->crc) { + printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " + "got %x\n", ofs, be32_to_cpu(oh->crc), + be32_to_cpu(crc)); + return -EIO; + } + return 0; +} + +static void move_btree_to_page(struct inode *inode, struct page *page, + __be64 *data) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct btree_head128 *head = &super->s_object_alias_tree; + struct logfs_block *block; + struct object_alias_item *item, *next; + + if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS)) + return; + + block = btree_remove128(head, inode->i_ino, page->index); + if (!block) + return; + + log_blockmove("move_btree_to_page(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + list_for_each_entry_safe(item, next, &block->item_list, list) { + data[item->child_no] = item->val; + list_del(&item->list); + mempool_free(item, super->s_alias_pool); + } + block->page = page; + SetPagePrivate(page); + page->private = (unsigned long)block; + block->ops = &indirect_block_ops; + initialize_block_counters(page, block, data, 0); +} + +/* + * This silences a false, yet annoying gcc warning. I hate it when my editor + * jumps into bitops.h each time I recompile this file. + * TODO: Complain to gcc folks about this and upgrade compiler. + */ +static unsigned long fnb(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +void move_page_to_btree(struct page *page) +{ + struct logfs_block *block = logfs_block(page); + struct super_block *sb = block->sb; + struct logfs_super *super = logfs_super(sb); + struct object_alias_item *item; + unsigned long pos; + __be64 *child; + int err; + + if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) { + block->ops->free_block(sb, block); + return; + } + log_blockmove("move_page_to_btree(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_BLOCK_FACTOR) + break; + + item = mempool_alloc(super->s_alias_pool, GFP_NOFS); + BUG_ON(!item); /* mempool empty */ + memset(item, 0, sizeof(*item)); + + child = kmap_atomic(page, KM_USER0); + item->val = child[pos]; + kunmap_atomic(child, KM_USER0); + item->child_no = pos; + list_add(&item->list, &block->item_list); + } + block->page = NULL; + ClearPagePrivate(page); + page->private = 0; + block->ops = &btree_block_ops; + err = alias_tree_insert(block->sb, block->ino, block->bix, block->level, + block); + BUG_ON(err); /* mempool empty */ + ClearPageUptodate(page); +} + +static int __logfs_segment_read(struct inode *inode, void *buf, + u64 ofs, u64 bix, level_t level) +{ + struct super_block *sb = inode->i_sb; + void *compressor_buf = logfs_super(sb)->s_compressed_je; + struct logfs_object_header oh; + __be32 crc; + u16 len; + int err, block_len; + + block_len = obj_len(sb, obj_type(inode, level)); + err = read_obj_header(sb, ofs, &oh); + if (err) + goto out_err; + + err = -EIO; + if (be64_to_cpu(oh.ino) != inode->i_ino + || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) { + printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: " + "expected (%lx, %llx), got (%llx, %llx)\n", + ofs, inode->i_ino, bix, + be64_to_cpu(oh.ino), be64_to_cpu(oh.bix)); + goto out_err; + } + + len = be16_to_cpu(oh.len); + + switch (oh.compr) { + case COMPR_NONE: + err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf); + if (err) + goto out_err; + crc = logfs_crc32(buf, len, 0); + if (crc != oh.data_crc) { + printk(KERN_ERR"LOGFS: uncompressed data crc error at " + "%llx: expected %x, got %x\n", ofs, + be32_to_cpu(oh.data_crc), + be32_to_cpu(crc)); + goto out_err; + } + break; + case COMPR_ZLIB: + mutex_lock(&logfs_super(sb)->s_journal_mutex); + err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, + compressor_buf); + if (err) { + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + goto out_err; + } + crc = logfs_crc32(compressor_buf, len, 0); + if (crc != oh.data_crc) { + printk(KERN_ERR"LOGFS: compressed data crc error at " + "%llx: expected %x, got %x\n", ofs, + be32_to_cpu(oh.data_crc), + be32_to_cpu(crc)); + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + goto out_err; + } + err = logfs_uncompress(compressor_buf, buf, len, block_len); + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + if (err) { + printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs); + goto out_err; + } + break; + default: + LOGFS_BUG(sb); + err = -EIO; + goto out_err; + } + return 0; + +out_err: + logfs_set_ro(sb); + printk(KERN_ERR"LOGFS: device is read-only now\n"); + LOGFS_BUG(sb); + return err; +} + +/** + * logfs_segment_read - read data block from object store + * @inode: inode containing data + * @buf: data buffer + * @ofs: physical data offset + * @bix: block index + * @level: block level + * + * Returns 0 on success or a negative errno. + */ +int logfs_segment_read(struct inode *inode, struct page *page, + u64 ofs, u64 bix, level_t level) +{ + int err; + void *buf; + + if (PageUptodate(page)) + return 0; + + ofs &= ~LOGFS_FULLY_POPULATED; + + buf = kmap(page); + err = __logfs_segment_read(inode, buf, ofs, bix, level); + if (!err) { + move_btree_to_page(inode, page, buf); + SetPageUptodate(page); + } + kunmap(page); + log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n", + inode->i_ino, bix, level, ofs, err); + return err; +} + +int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow) +{ + struct super_block *sb = inode->i_sb; + struct logfs_object_header h; + u16 len; + int err; + + BUG_ON(logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED); + if (!shadow->old_ofs) + return 0; + + log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + err = read_obj_header(sb, shadow->old_ofs, &h); + LOGFS_BUG_ON(err, sb); + LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb); + LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix), + shrink_level(shadow->gc_level)), sb); + + if (shadow->gc_level == 0) + len = be16_to_cpu(h.len); + else + len = obj_len(sb, h.type); + shadow->old_len = len + sizeof(h); + return 0; +} + +static void freeseg(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + u64 ofs, start, end; + + start = dev_ofs(sb, segno, 0); + end = dev_ofs(sb, segno + 1, 0); + for (ofs = start; ofs < end; ofs += PAGE_SIZE) { + page = find_get_page(mapping, ofs >> PAGE_SHIFT); + if (!page) + continue; + ClearPagePrivate(page); + page_cache_release(page); + } +} + +int logfs_open_area(struct logfs_area *area, size_t bytes) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + int err, closed = 0; + + if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize) + return 0; + + if (area->a_is_open) { + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + u32 len = super->s_segsize - area->a_written_bytes; + + log_gc("logfs_close_area(%x)\n", area->a_segno); + pad_wbuf(area, 1); + super->s_devops->writeseg(area->a_sb, ofs, len); + freeseg(sb, area->a_segno); + closed = 1; + } + + area->a_used_bytes = 0; + area->a_written_bytes = 0; +again: + area->a_ops->get_free_segment(area); + area->a_ops->get_erase_count(area); + + log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level); + err = area->a_ops->erase_segment(area); + if (err) { + printk(KERN_WARNING "LogFS: Error erasing segment %x\n", + area->a_segno); + logfs_mark_segment_bad(sb, area->a_segno); + goto again; + } + area->a_is_open = 1; + return closed; +} + +void logfs_sync_area(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + u32 len = (area->a_used_bytes - area->a_written_bytes); + + if (super->s_writesize) + len &= ~(super->s_writesize - 1); + if (len == 0) + return; + pad_wbuf(area, 0); + super->s_devops->writeseg(sb, ofs, len); + area->a_written_bytes += len; +} + +void logfs_sync_segments(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + for_each_area(i) + logfs_sync_area(super->s_area[i]); +} + +/* + * Pick a free segment to be used for this area. Effectively takes a + * candidate from the free list (not really a candidate anymore). + */ +static void ostore_get_free_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + + if (super->s_free_list.count == 0) { + printk(KERN_ERR"LOGFS: ran out of free segments\n"); + LOGFS_BUG(sb); + } + + area->a_segno = get_best_cand(sb, &super->s_free_list, NULL); +} + +static void ostore_get_erase_count(struct logfs_area *area) +{ + struct logfs_segment_entry se; + u32 ec_level; + + logfs_get_segment_entry(area->a_sb, area->a_segno, &se); + BUG_ON(se.ec_level == cpu_to_be32(BADSEG) || + se.valid == cpu_to_be32(RESERVED)); + + ec_level = be32_to_cpu(se.ec_level); + area->a_erase_count = (ec_level >> 4) + 1; +} + +static int ostore_erase_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_segment_header sh; + u64 ofs; + int err; + + err = logfs_erase_segment(sb, area->a_segno); + if (err) + return err; + + sh.pad = 0; + sh.type = SEG_OSTORE; + sh.level = (__force u8)area->a_level; + sh.segno = cpu_to_be32(area->a_segno); + sh.ec = cpu_to_be32(area->a_erase_count); + sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); + sh.crc = logfs_crc32(&sh, sizeof(sh), 4); + + logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, + area->a_level); + + ofs = dev_ofs(sb, area->a_segno, 0); + area->a_used_bytes = sizeof(sh); + logfs_buf_write(area, ofs, &sh, sizeof(sh)); + return 0; +} + +static const struct logfs_area_ops ostore_area_ops = { + .get_free_segment = ostore_get_free_segment, + .get_erase_count = ostore_get_erase_count, + .erase_segment = ostore_erase_segment, +}; + +static void free_area(struct logfs_area *area) +{ + if (area) + freeseg(area->a_sb, area->a_segno); + kfree(area); +} + +static struct logfs_area *alloc_area(struct super_block *sb) +{ + struct logfs_area *area; + + area = kzalloc(sizeof(*area), GFP_KERNEL); + if (!area) + return NULL; + + area->a_sb = sb; + return area; +} + +static void map_invalidatepage(struct page *page, unsigned long l) +{ + BUG(); +} + +static int map_releasepage(struct page *page, gfp_t g) +{ + /* Don't release these pages */ + return 0; +} + +static const struct address_space_operations mapping_aops = { + .invalidatepage = map_invalidatepage, + .releasepage = map_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, +}; + +int logfs_init_mapping(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping; + struct inode *inode; + + inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING); + if (IS_ERR(inode)) + return PTR_ERR(inode); + super->s_mapping_inode = inode; + mapping = inode->i_mapping; + mapping->a_ops = &mapping_aops; + /* Would it be possible to use __GFP_HIGHMEM as well? */ + mapping_set_gfp_mask(mapping, GFP_NOFS); + return 0; +} + +int logfs_init_areas(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i = -1; + + super->s_alias_pool = mempool_create_kmalloc_pool(600, + sizeof(struct object_alias_item)); + if (!super->s_alias_pool) + return -ENOMEM; + + super->s_journal_area = alloc_area(sb); + if (!super->s_journal_area) + goto err; + + for_each_area(i) { + super->s_area[i] = alloc_area(sb); + if (!super->s_area[i]) + goto err; + super->s_area[i]->a_level = GC_LEVEL(i); + super->s_area[i]->a_ops = &ostore_area_ops; + } + btree_init_mempool128(&super->s_object_alias_tree, + super->s_btree_pool); + return 0; + +err: + for (i--; i >= 0; i--) + free_area(super->s_area[i]); + free_area(super->s_journal_area); + mempool_destroy(super->s_alias_pool); + return -ENOMEM; +} + +void logfs_cleanup_areas(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias); + for_each_area(i) + free_area(super->s_area[i]); + free_area(super->s_journal_area); + destroy_meta_inode(super->s_mapping_inode); +} diff --git a/fs/logfs/super.c b/fs/logfs/super.c new file mode 100644 index 000000000000..d128a2c1c8d1 --- /dev/null +++ b/fs/logfs/super.c @@ -0,0 +1,634 @@ +/* + * fs/logfs/super.c + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + * + * Generally contains mount/umount code and also serves as a dump area for + * any functions that don't fit elsewhere and neither justify a file of their + * own. + */ +#include "logfs.h" +#include <linux/bio.h> +#include <linux/mtd/mtd.h> +#include <linux/statfs.h> +#include <linux/buffer_head.h> + +static DEFINE_MUTEX(emergency_mutex); +static struct page *emergency_page; + +struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index) +{ + filler_t *filler = (filler_t *)mapping->a_ops->readpage; + struct page *page; + int err; + + page = read_cache_page(mapping, index, filler, NULL); + if (page) + return page; + + /* No more pages available, switch to emergency page */ + printk(KERN_INFO"Logfs: Using emergency page\n"); + mutex_lock(&emergency_mutex); + err = filler(NULL, emergency_page); + if (err) { + mutex_unlock(&emergency_mutex); + printk(KERN_EMERG"Logfs: Error reading emergency page\n"); + return ERR_PTR(err); + } + return emergency_page; +} + +void emergency_read_end(struct page *page) +{ + if (page == emergency_page) + mutex_unlock(&emergency_mutex); + else + page_cache_release(page); +} + +static void dump_segfile(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_entry se; + u32 segno; + + for (segno = 0; segno < super->s_no_segs; segno++) { + logfs_get_segment_entry(sb, segno, &se); + printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + printk("\n"); + } +} + +/* + * logfs_crash_dump - dump debug information to device + * + * The LogFS superblock only occupies part of a segment. This function will + * write as much debug information as it can gather into the spare space. + */ +void logfs_crash_dump(struct super_block *sb) +{ + dump_segfile(sb); +} + +/* + * TODO: move to lib/string.c + */ +/** + * memchr_inv - Find a character in an area of memory. + * @s: The memory area + * @c: The byte to search for + * @n: The size of the area. + * + * returns the address of the first character other than @c, or %NULL + * if the whole buffer contains just @c. + */ +void *memchr_inv(const void *s, int c, size_t n) +{ + const unsigned char *p = s; + while (n-- != 0) + if ((unsigned char)c != *p++) + return (void *)(p - 1); + + return NULL; +} + +/* + * FIXME: There should be a reserve for root, similar to ext2. + */ +int logfs_statfs(struct dentry *dentry, struct kstatfs *stats) +{ + struct super_block *sb = dentry->d_sb; + struct logfs_super *super = logfs_super(sb); + + stats->f_type = LOGFS_MAGIC_U32; + stats->f_bsize = sb->s_blocksize; + stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3; + stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits; + stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits; + stats->f_files = 0; + stats->f_ffree = 0; + stats->f_namelen = LOGFS_MAX_NAMELEN; + return 0; +} + +static int logfs_sb_set(struct super_block *sb, void *_super) +{ + struct logfs_super *super = _super; + + sb->s_fs_info = super; + sb->s_mtd = super->s_mtd; + sb->s_bdev = super->s_bdev; + return 0; +} + +static int logfs_sb_test(struct super_block *sb, void *_super) +{ + struct logfs_super *super = _super; + struct mtd_info *mtd = super->s_mtd; + + if (mtd && sb->s_mtd == mtd) + return 1; + if (super->s_bdev && sb->s_bdev == super->s_bdev) + return 1; + return 0; +} + +static void set_segment_header(struct logfs_segment_header *sh, u8 type, + u8 level, u32 segno, u32 ec) +{ + sh->pad = 0; + sh->type = type; + sh->level = level; + sh->segno = cpu_to_be32(segno); + sh->ec = cpu_to_be32(ec); + sh->gec = cpu_to_be64(segno); + sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4); +} + +static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds, + u32 segno, u32 ec) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_header *sh = &ds->ds_sh; + int i; + + memset(ds, 0, sizeof(*ds)); + set_segment_header(sh, SEG_SUPER, 0, segno, ec); + + ds->ds_ifile_levels = super->s_ifile_levels; + ds->ds_iblock_levels = super->s_iblock_levels; + ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */ + ds->ds_segment_shift = super->s_segshift; + ds->ds_block_shift = sb->s_blocksize_bits; + ds->ds_write_shift = super->s_writeshift; + ds->ds_filesystem_size = cpu_to_be64(super->s_size); + ds->ds_segment_size = cpu_to_be32(super->s_segsize); + ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve); + ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat); + ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat); + ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat); + ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags); + ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve); + ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve); + journal_for_each(i) + ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]); + ds->ds_magic = cpu_to_be64(LOGFS_MAGIC); + ds->ds_crc = logfs_crc32(ds, sizeof(*ds), + LOGFS_SEGMENT_HEADERSIZE + 12); +} + +static int write_one_sb(struct super_block *sb, + struct page *(*find_sb)(struct super_block *sb, u64 *ofs)) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_disk_super *ds; + struct logfs_segment_entry se; + struct page *page; + u64 ofs; + u32 ec, segno; + int err; + + page = find_sb(sb, &ofs); + if (!page) + return -EIO; + ds = page_address(page); + segno = seg_no(sb, ofs); + logfs_get_segment_entry(sb, segno, &se); + ec = be32_to_cpu(se.ec_level) >> 4; + ec++; + logfs_set_segment_erased(sb, segno, ec, 0); + logfs_write_ds(sb, ds, segno, ec); + err = super->s_devops->write_sb(sb, page); + page_cache_release(page); + return err; +} + +int logfs_write_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int err; + + /* First superblock */ + err = write_one_sb(sb, super->s_devops->find_first_sb); + if (err) + return err; + + /* Last superblock */ + err = write_one_sb(sb, super->s_devops->find_last_sb); + if (err) + return err; + return 0; +} + +static int ds_cmp(const void *ds0, const void *ds1) +{ + size_t len = sizeof(struct logfs_disk_super); + + /* We know the segment headers differ, so ignore them */ + len -= LOGFS_SEGMENT_HEADERSIZE; + ds0 += LOGFS_SEGMENT_HEADERSIZE; + ds1 += LOGFS_SEGMENT_HEADERSIZE; + return memcmp(ds0, ds1, len); +} + +static int logfs_recover_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_disk_super _ds0, *ds0 = &_ds0; + struct logfs_disk_super _ds1, *ds1 = &_ds1; + int err, valid0, valid1; + + /* read first superblock */ + err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0); + if (err) + return err; + /* read last superblock */ + err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1); + if (err) + return err; + valid0 = logfs_check_ds(ds0) == 0; + valid1 = logfs_check_ds(ds1) == 0; + + if (!valid0 && valid1) { + printk(KERN_INFO"First superblock is invalid - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_first_sb); + } + if (valid0 && !valid1) { + printk(KERN_INFO"Last superblock is invalid - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_last_sb); + } + if (valid0 && valid1 && ds_cmp(ds0, ds1)) { + printk(KERN_INFO"Superblocks don't match - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_last_sb); + } + /* If neither is valid now, something's wrong. Didn't we properly + * check them before?!? */ + BUG_ON(!valid0 && !valid1); + return 0; +} + +static int logfs_make_writeable(struct super_block *sb) +{ + int err; + + /* Repair any broken superblock copies */ + err = logfs_recover_sb(sb); + if (err) + return err; + + /* Check areas for trailing unaccounted data */ + err = logfs_check_areas(sb); + if (err) + return err; + + err = logfs_open_segfile(sb); + if (err) + return err; + + /* Do one GC pass before any data gets dirtied */ + logfs_gc_pass(sb); + + /* after all initializations are done, replay the journal + * for rw-mounts, if necessary */ + err = logfs_replay_journal(sb); + if (err) + return err; + + return 0; +} + +static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt) +{ + struct inode *rootdir; + int err; + + /* root dir */ + rootdir = logfs_iget(sb, LOGFS_INO_ROOT); + if (IS_ERR(rootdir)) + goto fail; + + sb->s_root = d_alloc_root(rootdir); + if (!sb->s_root) + goto fail; + + /* FIXME: check for read-only mounts */ + err = logfs_make_writeable(sb); + if (err) + goto fail2; + + log_super("LogFS: Finished mounting\n"); + simple_set_mnt(mnt, sb); + return 0; + +fail2: + iput(rootdir); +fail: + iput(logfs_super(sb)->s_master_inode); + return -EIO; +} + +int logfs_check_ds(struct logfs_disk_super *ds) +{ + struct logfs_segment_header *sh = &ds->ds_sh; + + if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC)) + return -EINVAL; + if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4)) + return -EINVAL; + if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds), + LOGFS_SEGMENT_HEADERSIZE + 12)) + return -EINVAL; + return 0; +} + +static struct page *find_super_block(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct page *first, *last; + + first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]); + if (!first || IS_ERR(first)) + return NULL; + last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]); + if (!last || IS_ERR(first)) { + page_cache_release(first); + return NULL; + } + + if (!logfs_check_ds(page_address(first))) { + page_cache_release(last); + return first; + } + + /* First one didn't work, try the second superblock */ + if (!logfs_check_ds(page_address(last))) { + page_cache_release(first); + return last; + } + + /* Neither worked, sorry folks */ + page_cache_release(first); + page_cache_release(last); + return NULL; +} + +static int __logfs_read_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct page *page; + struct logfs_disk_super *ds; + int i; + + page = find_super_block(sb); + if (!page) + return -EIO; + + ds = page_address(page); + super->s_size = be64_to_cpu(ds->ds_filesystem_size); + super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve); + super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve); + super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve); + super->s_segsize = 1 << ds->ds_segment_shift; + super->s_segmask = (1 << ds->ds_segment_shift) - 1; + super->s_segshift = ds->ds_segment_shift; + sb->s_blocksize = 1 << ds->ds_block_shift; + sb->s_blocksize_bits = ds->ds_block_shift; + super->s_writesize = 1 << ds->ds_write_shift; + super->s_writeshift = ds->ds_write_shift; + super->s_no_segs = super->s_size >> super->s_segshift; + super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits; + super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat); + super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat); + super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat); + super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags); + + journal_for_each(i) + super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]); + + super->s_ifile_levels = ds->ds_ifile_levels; + super->s_iblock_levels = ds->ds_iblock_levels; + super->s_data_levels = ds->ds_data_levels; + super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels + + super->s_data_levels; + page_cache_release(page); + return 0; +} + +static int logfs_read_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int ret; + + super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL); + if (!super->s_btree_pool) + return -ENOMEM; + + btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool); + btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool); + + ret = logfs_init_mapping(sb); + if (ret) + return ret; + + ret = __logfs_read_sb(sb); + if (ret) + return ret; + + mutex_init(&super->s_dirop_mutex); + mutex_init(&super->s_object_alias_mutex); + INIT_LIST_HEAD(&super->s_freeing_list); + + ret = logfs_init_rw(sb); + if (ret) + return ret; + + ret = logfs_init_areas(sb); + if (ret) + return ret; + + ret = logfs_init_gc(sb); + if (ret) + return ret; + + ret = logfs_init_journal(sb); + if (ret) + return ret; + + return 0; +} + +static void logfs_kill_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + log_super("LogFS: Start unmounting\n"); + /* Alias entries slow down mount, so evict as many as possible */ + sync_filesystem(sb); + logfs_write_anchor(super->s_master_inode); + + /* + * From this point on alias entries are simply dropped - and any + * writes to the object store are considered bugs. + */ + super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN; + log_super("LogFS: Now in shutdown\n"); + generic_shutdown_super(sb); + + BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes); + + logfs_cleanup_gc(sb); + logfs_cleanup_journal(sb); + logfs_cleanup_areas(sb); + logfs_cleanup_rw(sb); + super->s_devops->put_device(sb); + mempool_destroy(super->s_btree_pool); + mempool_destroy(super->s_alias_pool); + kfree(super); + log_super("LogFS: Finished unmounting\n"); +} + +int logfs_get_sb_device(struct file_system_type *type, int flags, + struct mtd_info *mtd, struct block_device *bdev, + const struct logfs_device_ops *devops, struct vfsmount *mnt) +{ + struct logfs_super *super; + struct super_block *sb; + int err = -ENOMEM; + static int mount_count; + + log_super("LogFS: Start mount %x\n", mount_count++); + super = kzalloc(sizeof(*super), GFP_KERNEL); + if (!super) + goto err0; + + super->s_mtd = mtd; + super->s_bdev = bdev; + err = -EINVAL; + sb = sget(type, logfs_sb_test, logfs_sb_set, super); + if (IS_ERR(sb)) + goto err0; + + if (sb->s_root) { + /* Device is already in use */ + err = 0; + simple_set_mnt(mnt, sb); + goto err0; + } + + super->s_devops = devops; + + /* + * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache + * only covers 16TB and the upper 8TB are used for indirect blocks. + * On 64bit system we could bump up the limit, but that would make + * the filesystem incompatible with 32bit systems. + */ + sb->s_maxbytes = (1ull << 43) - 1; + sb->s_op = &logfs_super_operations; + sb->s_flags = flags | MS_NOATIME; + + err = logfs_read_sb(sb); + if (err) + goto err1; + + sb->s_flags |= MS_ACTIVE; + err = logfs_get_sb_final(sb, mnt); + if (err) + goto err1; + return 0; + +err1: + up_write(&sb->s_umount); + deactivate_super(sb); + return err; +err0: + kfree(super); + //devops->put_device(sb); + return err; +} + +static int logfs_get_sb(struct file_system_type *type, int flags, + const char *devname, void *data, struct vfsmount *mnt) +{ + ulong mtdnr; + + if (!devname) + return logfs_get_sb_bdev(type, flags, devname, mnt); + if (strncmp(devname, "mtd", 3)) + return logfs_get_sb_bdev(type, flags, devname, mnt); + + { + char *garbage; + mtdnr = simple_strtoul(devname+3, &garbage, 0); + if (*garbage) + return -EINVAL; + } + + return logfs_get_sb_mtd(type, flags, mtdnr, mnt); +} + +static struct file_system_type logfs_fs_type = { + .owner = THIS_MODULE, + .name = "logfs", + .get_sb = logfs_get_sb, + .kill_sb = logfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, + +}; + +static int __init logfs_init(void) +{ + int ret; + + emergency_page = alloc_pages(GFP_KERNEL, 0); + if (!emergency_page) + return -ENOMEM; + + ret = logfs_compr_init(); + if (ret) + goto out1; + + ret = logfs_init_inode_cache(); + if (ret) + goto out2; + + return register_filesystem(&logfs_fs_type); +out2: + logfs_compr_exit(); +out1: + __free_pages(emergency_page, 0); + return ret; +} + +static void __exit logfs_exit(void) +{ + unregister_filesystem(&logfs_fs_type); + logfs_destroy_inode_cache(); + logfs_compr_exit(); + __free_pages(emergency_page, 0); +} + +module_init(logfs_init); +module_exit(logfs_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Joern Engel <joern@logfs.org>"); +MODULE_DESCRIPTION("scalable flash filesystem"); -- cgit v1.2.3 From ddfd1f04b7bc557c1fe9b110e99cebb2e19d4993 Mon Sep 17 00:00:00 2001 From: Joern Engel <joern@logfs.org> Date: Mon, 23 Nov 2009 14:29:12 +0100 Subject: [LogFS] Plug memory leak on error paths Spotted by Dan Carpenter. --- fs/logfs/dir.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 89104e6f81c4..e7659b15a907 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -224,10 +224,14 @@ static int logfs_unlink(struct inode *dir, struct dentry *dentry) inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; page = logfs_get_dd_page(dir, dentry); - if (!page) + if (!page) { + kfree(ta); return -ENOENT; - if (IS_ERR(page)) + } + if (IS_ERR(page)) { + kfree(ta); return PTR_ERR(page); + } index = page->index; page_cache_release(page); -- cgit v1.2.3 From 30835cd074381048b0ea5d53e27725bbd0bdf5b7 Mon Sep 17 00:00:00 2001 From: Joern Engel <joern@logfs.org> Date: Sat, 28 Nov 2009 13:14:08 +0100 Subject: [LogFS] Prevent 64bit divisions in hash_index Randy Dunlap caught this built error on i386: fs/built-in.o: In function `hash_index': dir.c:(.text+0x6c1f2): undefined reference to `__umoddi3' --- fs/logfs/dir.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index e7659b15a907..56a8bfbb0120 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -133,17 +133,22 @@ static u32 hash_32(const char *s, int len, u32 seed) */ static pgoff_t hash_index(u32 hash, int round) { + u32 i0_blocks = I0_BLOCKS; + u32 i1_blocks = I1_BLOCKS; + u32 i2_blocks = I2_BLOCKS; + u32 i3_blocks = I3_BLOCKS; + switch (round) { case 0: - return hash % I0_BLOCKS; + return hash % i0_blocks; case 1: - return I0_BLOCKS + hash % (I1_BLOCKS - I0_BLOCKS); + return i0_blocks + hash % (i1_blocks - i0_blocks); case 2: - return I1_BLOCKS + hash % (I2_BLOCKS - I1_BLOCKS); + return i1_blocks + hash % (i2_blocks - i1_blocks); case 3: - return I2_BLOCKS + hash % (I3_BLOCKS - I2_BLOCKS); + return i2_blocks + hash % (i3_blocks - i2_blocks); case 4 ... 19: - return I3_BLOCKS + 16 * (hash % (((1<<31) - I3_BLOCKS) / 16)) + return i3_blocks + 16 * (hash % (((1<<31) - i3_blocks) / 16)) + round - 4; } BUG(); -- cgit v1.2.3 From 5c564c2a04d4bb6ba79eeb83bd06de584479f362 Mon Sep 17 00:00:00 2001 From: Joern Engel <joern@logfs.org> Date: Mon, 7 Dec 2009 12:34:43 +0100 Subject: [LogFS] Silence gcc Andrew Morton sayeth: fs/logfs/journal.c: In function 'logfs_init_journal': fs/logfs/journal.c:266: warning: 'last_len' may be used uninitialized in this function Can this be squished please? --- fs/logfs/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c index 7a023dbba9f8..2f2e8e4fd02d 100644 --- a/fs/logfs/journal.c +++ b/fs/logfs/journal.c @@ -263,7 +263,7 @@ static int logfs_read_segment(struct super_block *sb, u32 segno) struct logfs_journal_header *jh = super->s_compressed_je; u64 ofs, seg_ofs = dev_ofs(sb, segno, 0); u32 h_ofs, last_ofs = 0; - u16 len, datalen, last_len; + u16 len, datalen, last_len = 0; int i, err; /* search for most recent commit */ -- cgit v1.2.3 From a1de02dccf906faba2ee2d99cac56799bda3b96a Mon Sep 17 00:00:00 2001 From: Eric Sandeen <sandeen@redhat.com> Date: Thu, 4 Feb 2010 23:58:38 -0500 Subject: ext4: fix async i/o writes beyond 4GB to a sparse file The "offset" member in ext4_io_end holds bytes, not blocks, so ext4_lblk_t is wrong - and too small (u32). This caused the async i/o writes to sparse files beyond 4GB to fail when they wrapped around to 0. Also fix up the type of arguments to ext4_convert_unwritten_extents(), it gets ssize_t from ext4_end_aio_dio_nolock() and ext4_ext_direct_IO(). Reported-by: Giel de Nijs <giel@vectorwise.com> Signed-off-by: Eric Sandeen <sandeen@redhat.com> --- fs/ext4/ext4.h | 6 +++--- fs/ext4/extents.c | 2 +- fs/ext4/inode.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 874d169a193e..602d5ad6f5e7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -139,8 +139,8 @@ typedef struct ext4_io_end { struct inode *inode; /* file being written to */ unsigned int flag; /* unwritten or not */ int error; /* I/O error code */ - ext4_lblk_t offset; /* offset in the file */ - size_t size; /* size of the extent */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ struct work_struct work; /* data work queue */ } ext4_io_end_t; @@ -1744,7 +1744,7 @@ extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, - loff_t len); + ssize_t len); extern int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, unsigned int max_blocks, struct buffer_head *bh, int flags); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 765a4826b118..c56877972b0e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3603,7 +3603,7 @@ retry: * Returns 0 on success. */ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, - loff_t len) + ssize_t len) { handle_t *handle; ext4_lblk_t block; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e11952404e02..2059c34ac4c8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3551,7 +3551,7 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) { struct inode *inode = io->inode; loff_t offset = io->offset; - size_t size = io->size; + ssize_t size = io->size; int ret = 0; ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," -- cgit v1.2.3 From 15121c18a22ae483279f76dc9e554334b800d0f7 Mon Sep 17 00:00:00 2001 From: Eric Sandeen <sandeen@redhat.com> Date: Mon, 15 Feb 2010 20:17:55 -0500 Subject: ext4: Fix optional-arg mount options We have 2 mount options, "barrier" and "auto_da_alloc" which may or may not take a 1/0 argument. This causes the ext4 superblock mount code to subtract uninitialized pointers and pass the result to kmalloc, which results in very noisy failures. Per Ted's suggestion, initialize the args struct so that we know whether match_token() found an argument for the option, and skip match_int() if not. Also, return error (0) from parse_options if we thought we found an argument, but match_int() Fails. Reported-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Eric Sandeen <sandeen@redhat.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/super.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 735c20d5fd56..68a55dffb360 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1229,6 +1229,11 @@ static int parse_options(char *options, struct super_block *sb, if (!*p) continue; + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = 0; token = match_token(p, tokens, args); switch (token) { case Opt_bsd_df: @@ -1518,10 +1523,11 @@ set_qf_format: clear_opt(sbi->s_mount_opt, BARRIER); break; case Opt_barrier: - if (match_int(&args[0], &option)) { - set_opt(sbi->s_mount_opt, BARRIER); - break; - } + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = 1; /* No argument, default to 1 */ if (option) set_opt(sbi->s_mount_opt, BARRIER); else @@ -1594,10 +1600,11 @@ set_qf_format: set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); break; case Opt_auto_da_alloc: - if (match_int(&args[0], &option)) { - clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); - break; - } + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = 1; /* No argument, default to 1 */ if (option) clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); else -- cgit v1.2.3 From 1f2acb6017d8528135ec3b01ab7cd2be6ea0630b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o <tytso@mit.edu> Date: Fri, 22 Jan 2010 17:40:42 -0500 Subject: ext4: Add block validity check when truncating indirect block mapped inodes Add checks to ext4_free_branches() to make sure a block number found in an indirect block are valid before trying to free it. If a bad block number is found, stop freeing the indirect block immediately, since the file system is corrupt and we will need to run fsck anyway. This also avoids spamming the logs, and specifically avoids driver-level "attempt to access beyond end of device" errors obscure what is really going on. If you get *really*, *really*, *really* unlucky, without this patch, a supposed indirect block containing garbage might contain a reference to a primary block group descriptor, in which case ext4_free_branches() could end up zero'ing out a block group descriptor block, and if then one of the block bitmaps for a block group described by that bg descriptor block is not in memory, and is read in by ext4_read_block_bitmap(). This function calls ext4_valid_block_bitmap(), which assumes that bg_inode_table() was validated at mount time and hasn't been modified since. Since this assumption is no longer valid, it's possible for the value (ext4_inode_table(sb, desc) - group_first_block) to go negative, which will cause ext4_find_next_zero_bit() to trigger a kernel GPF. Addresses-Google-Bug: #2220436 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/ext4.h | 1 + fs/ext4/inode.c | 39 ++++++++++++++++++++++++++++++--------- fs/ext4/mballoc.c | 7 ++++--- 3 files changed, 35 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 602d5ad6f5e7..307ecd13a762 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -377,6 +377,7 @@ struct ext4_new_group_data { */ #define EXT4_FREE_BLOCKS_METADATA 0x0001 #define EXT4_FREE_BLOCKS_FORGET 0x0002 +#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 /* * ioctl commands diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2059c34ac4c8..3e8afd969236 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4130,18 +4130,27 @@ no_top: * We release `count' blocks on disk, but (last - first) may be greater * than `count' because there can be holes in there. */ -static void ext4_clear_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *bh, - ext4_fsblk_t block_to_free, - unsigned long count, __le32 *first, - __le32 *last) +static int ext4_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, + ext4_fsblk_t block_to_free, + unsigned long count, __le32 *first, + __le32 *last) { __le32 *p; - int flags = EXT4_FREE_BLOCKS_FORGET; + int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) flags |= EXT4_FREE_BLOCKS_METADATA; + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, + count)) { + ext4_error(inode->i_sb, __func__, "inode #%lu: " + "attempt to clear blocks %llu len %lu, invalid", + inode->i_ino, (unsigned long long) block_to_free, + count); + return 1; + } + if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); @@ -4160,6 +4169,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, *p = 0; ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); + return 0; } /** @@ -4215,9 +4225,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, } else if (nr == block_to_free + count) { count++; } else { - ext4_clear_blocks(handle, inode, this_bh, - block_to_free, - count, block_to_free_p, p); + if (ext4_clear_blocks(handle, inode, this_bh, + block_to_free, count, + block_to_free_p, p)) + break; block_to_free = nr; block_to_free_p = p; count = 1; @@ -4281,6 +4292,16 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, if (!nr) continue; /* A hole */ + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), + nr, 1)) { + ext4_error(inode->i_sb, __func__, + "indirect mapped block in inode " + "#%lu invalid (level %d, blk #%lu)", + inode->i_ino, depth, + (unsigned long) nr); + break; + } + /* Go read the buffer for the next level down */ bh = sb_bread(inode->i_sb, nr); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d34afad3e137..d129c1039f1d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4476,10 +4476,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, sbi = EXT4_SB(sb); es = EXT4_SB(sb)->s_es; - if (!ext4_data_block_valid(sbi, block, count)) { + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && + !ext4_data_block_valid(sbi, block, count)) { ext4_error(sb, __func__, - "Freeing blocks not in datazone - " - "block = %llu, count = %lu", block, count); + "Freeing blocks not in datazone - " + "block = %llu, count = %lu", block, count); goto error_return; } -- cgit v1.2.3 From 71f2be213a0009098819e5c04f75ff19f84f2122 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o <tytso@mit.edu> Date: Wed, 23 Dec 2009 07:45:44 -0500 Subject: ext4: Add new tracepoint for jbd2_cleanup_journal_tail Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/jbd2/checkpoint.c | 1 + include/trace/events/jbd2.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'fs') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 886849370950..30beb11ef928 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -507,6 +507,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) if (blocknr < journal->j_tail) freed = freed + journal->j_last - journal->j_first; + trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed); jbd_debug(1, "Cleaning journal tail from %d to %d (offset %lu), " "freeing %lu\n", diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index 96b370a050de..bf16545cc977 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -199,6 +199,34 @@ TRACE_EVENT(jbd2_checkpoint_stats, __entry->forced_to_close, __entry->written, __entry->dropped) ); +TRACE_EVENT(jbd2_cleanup_journal_tail, + + TP_PROTO(journal_t *journal, tid_t first_tid, + unsigned long block_nr, unsigned long freed), + + TP_ARGS(journal, first_tid, block_nr, freed), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( tid_t, tail_sequence ) + __field( tid_t, first_tid ) + __field(unsigned long, block_nr ) + __field(unsigned long, freed ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->tail_sequence = journal->j_tail_sequence; + __entry->first_tid = first_tid; + __entry->block_nr = block_nr; + __entry->freed = freed; + ), + + TP_printk("dev %s from %u to %u offset %lu freed %lu", + jbd2_dev_to_name(__entry->dev), __entry->tail_sequence, + __entry->first_tid, __entry->block_nr, __entry->freed) +); + #endif /* _TRACE_JBD2_H */ /* This part must be outside protection */ -- cgit v1.2.3 From f8ec9d6837241865cf99bed97bb99f4399fd5a03 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o <tytso@mit.edu> Date: Fri, 1 Jan 2010 01:00:21 -0500 Subject: ext4: Add new tracepoints to debug delayed allocation space functions Add tracepoints for ext4_da_reserve_space(), ext4_da_update_reserve_space(), and ext4_da_release_space(). Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/inode.c | 2 + include/trace/events/ext4.h | 101 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3e8afd969236..1a3d7b232cd7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1061,6 +1061,7 @@ void ext4_da_update_reserve_space(struct inode *inode, int mdb_free = 0, allocated_meta_blocks = 0; spin_lock(&ei->i_block_reservation_lock); + trace_ext4_da_update_reserve_space(inode, used); if (unlikely(used > ei->i_reserved_data_blocks)) { ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " "with only %d reserved data blocks\n", @@ -1846,6 +1847,7 @@ repeat: spin_lock(&ei->i_block_reservation_lock); md_reserved = ei->i_reserved_meta_blocks; md_needed = ext4_calc_metadata_amount(inode, lblock); + trace_ext4_da_reserve_space(inode, md_needed); spin_unlock(&ei->i_block_reservation_lock); /* diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index d0b6cd3afb2f..2aa6aa3e8f61 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -874,6 +874,107 @@ TRACE_EVENT(ext4_forget, __entry->mode, __entry->is_metadata, __entry->block) ); +TRACE_EVENT(ext4_da_update_reserve_space, + TP_PROTO(struct inode *inode, int used_blocks), + + TP_ARGS(inode, used_blocks), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( umode_t, mode ) + __field( __u64, i_blocks ) + __field( int, used_blocks ) + __field( int, reserved_data_blocks ) + __field( int, reserved_meta_blocks ) + __field( int, allocated_meta_blocks ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; + __entry->i_blocks = inode->i_blocks; + __entry->used_blocks = used_blocks; + __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; + __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; + __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; + ), + + TP_printk("dev %s ino %lu mode 0%o i_blocks %llu used_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->mode, (unsigned long long) __entry->i_blocks, + __entry->used_blocks, __entry->reserved_data_blocks, + __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) +); + +TRACE_EVENT(ext4_da_reserve_space, + TP_PROTO(struct inode *inode, int md_needed), + + TP_ARGS(inode, md_needed), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( umode_t, mode ) + __field( __u64, i_blocks ) + __field( int, md_needed ) + __field( int, reserved_data_blocks ) + __field( int, reserved_meta_blocks ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; + __entry->i_blocks = inode->i_blocks; + __entry->md_needed = md_needed; + __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; + __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; + ), + + TP_printk("dev %s ino %lu mode 0%o i_blocks %llu md_needed %d reserved_data_blocks %d reserved_meta_blocks %d", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->mode, (unsigned long long) __entry->i_blocks, + __entry->md_needed, __entry->reserved_data_blocks, + __entry->reserved_meta_blocks) +); + +TRACE_EVENT(ext4_da_release_space, + TP_PROTO(struct inode *inode, int freed_blocks), + + TP_ARGS(inode, freed_blocks), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( umode_t, mode ) + __field( __u64, i_blocks ) + __field( int, freed_blocks ) + __field( int, reserved_data_blocks ) + __field( int, reserved_meta_blocks ) + __field( int, allocated_meta_blocks ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; + __entry->i_blocks = inode->i_blocks; + __entry->freed_blocks = freed_blocks; + __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; + __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; + __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; + ), + + TP_printk("dev %s ino %lu mode 0%o i_blocks %llu freed_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->mode, (unsigned long long) __entry->i_blocks, + __entry->freed_blocks, __entry->reserved_data_blocks, + __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) +); + + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit v1.2.3 From d2eecb03936878ec574ade5532fa83df7d75dde7 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o <tytso@mit.edu> Date: Mon, 7 Dec 2009 10:36:20 -0500 Subject: ext4: Use slab allocator for sub-page sized allocations Now that the SLUB seems to be fixed so that it respects the requested alignment, use kmem_cache_alloc() to allocator if the block size of the buffer heads to be allocated is less than the page size. Previously, we were using 16k page on a Power system for each buffer, even when the file system was using 1k or 4k block size. Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/jbd2/journal.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/jbd2.h | 11 +---- 2 files changed, 134 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index ac0d027595d0..c03d4dce4d76 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -39,6 +39,8 @@ #include <linux/seq_file.h> #include <linux/math64.h> #include <linux/hash.h> +#include <linux/log2.h> +#include <linux/vmalloc.h> #define CREATE_TRACE_POINTS #include <trace/events/jbd2.h> @@ -93,6 +95,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); +static int jbd2_journal_create_slab(size_t slab_size); /* * Helper function used to manage commit timeouts @@ -1248,6 +1251,13 @@ int jbd2_journal_load(journal_t *journal) } } + /* + * Create a slab for this blocksize + */ + err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize)); + if (err) + return err; + /* Let the recovery code check whether it needs to recover any * data from the journal. */ if (jbd2_journal_recover(journal)) @@ -1806,6 +1816,127 @@ size_t journal_tag_bytes(journal_t *journal) return JBD2_TAG_SIZE32; } +/* + * JBD memory management + * + * These functions are used to allocate block-sized chunks of memory + * used for making copies of buffer_head data. Very often it will be + * page-sized chunks of data, but sometimes it will be in + * sub-page-size chunks. (For example, 16k pages on Power systems + * with a 4k block file system.) For blocks smaller than a page, we + * use a SLAB allocator. There are slab caches for each block size, + * which are allocated at mount time, if necessary, and we only free + * (all of) the slab caches when/if the jbd2 module is unloaded. For + * this reason we don't need to a mutex to protect access to + * jbd2_slab[] allocating or releasing memory; only in + * jbd2_journal_create_slab(). + */ +#define JBD2_MAX_SLABS 8 +static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS]; +static DECLARE_MUTEX(jbd2_slab_create_sem); + +static const char *jbd2_slab_names[JBD2_MAX_SLABS] = { + "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k", + "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k" +}; + + +static void jbd2_journal_destroy_slabs(void) +{ + int i; + + for (i = 0; i < JBD2_MAX_SLABS; i++) { + if (jbd2_slab[i]) + kmem_cache_destroy(jbd2_slab[i]); + jbd2_slab[i] = NULL; + } +} + +static int jbd2_journal_create_slab(size_t size) +{ + int i = order_base_2(size) - 10; + size_t slab_size; + + if (size == PAGE_SIZE) + return 0; + + if (i >= JBD2_MAX_SLABS) + return -EINVAL; + + if (unlikely(i < 0)) + i = 0; + down(&jbd2_slab_create_sem); + if (jbd2_slab[i]) { + up(&jbd2_slab_create_sem); + return 0; /* Already created */ + } + + slab_size = 1 << (i+10); + jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size, + slab_size, 0, NULL); + up(&jbd2_slab_create_sem); + if (!jbd2_slab[i]) { + printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n"); + return -ENOMEM; + } + return 0; +} + +static struct kmem_cache *get_slab(size_t size) +{ + int i = order_base_2(size) - 10; + + BUG_ON(i >= JBD2_MAX_SLABS); + if (unlikely(i < 0)) + i = 0; + BUG_ON(jbd2_slab[i] == 0); + return jbd2_slab[i]; +} + +void *jbd2_alloc(size_t size, gfp_t flags) +{ + void *ptr; + + BUG_ON(size & (size-1)); /* Must be a power of 2 */ + + flags |= __GFP_REPEAT; + if (size == PAGE_SIZE) + ptr = (void *)__get_free_pages(flags, 0); + else if (size > PAGE_SIZE) { + int order = get_order(size); + + if (order < 3) + ptr = (void *)__get_free_pages(flags, order); + else + ptr = vmalloc(size); + } else + ptr = kmem_cache_alloc(get_slab(size), flags); + + /* Check alignment; SLUB has gotten this wrong in the past, + * and this can lead to user data corruption! */ + BUG_ON(((unsigned long) ptr) & (size-1)); + + return ptr; +} + +void jbd2_free(void *ptr, size_t size) +{ + if (size == PAGE_SIZE) { + free_pages((unsigned long)ptr, 0); + return; + } + if (size > PAGE_SIZE) { + int order = get_order(size); + + if (order < 3) + free_pages((unsigned long)ptr, order); + else + vfree(ptr); + return; + } + kmem_cache_free(get_slab(size), ptr); +}; + /* * Journal_head storage management */ @@ -2204,6 +2335,7 @@ static void jbd2_journal_destroy_caches(void) jbd2_journal_destroy_revoke_caches(); jbd2_journal_destroy_jbd2_journal_head_cache(); jbd2_journal_destroy_handle_cache(); + jbd2_journal_destroy_slabs(); } static int __init journal_init(void) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 638ce4554c76..8ada2a129d08 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -69,15 +69,8 @@ extern u8 jbd2_journal_enable_debug; #define jbd_debug(f, a...) /**/ #endif -static inline void *jbd2_alloc(size_t size, gfp_t flags) -{ - return (void *)__get_free_pages(flags, get_order(size)); -} - -static inline void jbd2_free(void *ptr, size_t size) -{ - free_pages((unsigned long)ptr, get_order(size)); -}; +extern void *jbd2_alloc(size_t size, gfp_t flags); +extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 -- cgit v1.2.3 From e03a72e13648ac6277bf2bab6b8324d51f89c0fa Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" <martin.petersen@oracle.com> Date: Mon, 11 Jan 2010 03:21:51 -0500 Subject: block: Stop using byte offsets All callers of the stacking functions use 512-byte sector units rather than byte offsets. Simplify the code so the stacking functions take sectors when specifying data offsets. Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com> Signed-off-by: Jens Axboe <jens.axboe@oracle.com> --- block/blk-settings.c | 26 +++++++++----------------- fs/partitions/check.c | 7 ++++--- include/linux/blkdev.h | 17 +++++------------ 3 files changed, 18 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/block/blk-settings.c b/block/blk-settings.c index 5eeb9e0d256e..78549c723783 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -507,7 +507,7 @@ static unsigned int lcm(unsigned int a, unsigned int b) * blk_stack_limits - adjust queue_limits for stacked devices * @t: the stacking driver limits (top device) * @b: the underlying queue limits (bottom, component device) - * @offset: offset to beginning of data within component device + * @start: first data sector within component device * * Description: * This function is used by stacking drivers like MD and DM to ensure @@ -525,10 +525,9 @@ static unsigned int lcm(unsigned int a, unsigned int b) * the alignment_offset is undefined. */ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, - sector_t offset) + sector_t start) { - sector_t alignment; - unsigned int top, bottom, ret = 0; + unsigned int top, bottom, alignment, ret = 0; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); @@ -548,7 +547,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->misaligned |= b->misaligned; - alignment = queue_limit_alignment_offset(b, offset); + alignment = queue_limit_alignment_offset(b, start); /* Bottom device has different alignment. Check that it is * compatible with the current top alignment. @@ -611,11 +610,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, /* Discard alignment and granularity */ if (b->discard_granularity) { - unsigned int granularity = b->discard_granularity; - offset &= granularity - 1; - - alignment = (granularity + b->discard_alignment - offset) - & (granularity - 1); + alignment = queue_limit_discard_alignment(b, start); if (t->discard_granularity != 0 && t->discard_alignment != alignment) { @@ -657,7 +652,7 @@ int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, start += get_start_sect(bdev); - return blk_stack_limits(t, &bq->limits, start << 9); + return blk_stack_limits(t, &bq->limits, start); } EXPORT_SYMBOL(bdev_stack_limits); @@ -668,9 +663,8 @@ EXPORT_SYMBOL(bdev_stack_limits); * @offset: offset to beginning of data within component device * * Description: - * Merges the limits for two queues. Returns 0 if alignment - * didn't change. Returns -1 if adding the bottom device caused - * misalignment. + * Merges the limits for a top level gendisk and a bottom level + * block_device. */ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset) @@ -678,9 +672,7 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, struct request_queue *t = disk->queue; struct request_queue *b = bdev_get_queue(bdev); - offset += get_start_sect(bdev) << 9; - - if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) { + if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) { char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; disk_name(disk, 0, top); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 64bc8998ac9a..e8865c11777f 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -412,9 +412,10 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev = part_to_dev(p); p->start_sect = start; - p->alignment_offset = queue_sector_alignment_offset(disk->queue, start); - p->discard_alignment = queue_sector_discard_alignment(disk->queue, - start); + p->alignment_offset = + queue_limit_alignment_offset(&disk->queue->limits, start); + p->discard_alignment = + queue_limit_discard_alignment(&disk->queue->limits, start); p->nr_sects = len; p->partno = partno; p->policy = get_disk_ro(disk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5c8018977efa..ffb13ad35716 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1112,18 +1112,13 @@ static inline int queue_alignment_offset(struct request_queue *q) return q->limits.alignment_offset; } -static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t offset) +static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t sector) { unsigned int granularity = max(lim->physical_block_size, lim->io_min); + unsigned int alignment = (sector << 9) & (granularity - 1); - offset &= granularity - 1; - return (granularity + lim->alignment_offset - offset) & (granularity - 1); -} - -static inline int queue_sector_alignment_offset(struct request_queue *q, - sector_t sector) -{ - return queue_limit_alignment_offset(&q->limits, sector << 9); + return (granularity + lim->alignment_offset - alignment) + & (granularity - 1); } static inline int bdev_alignment_offset(struct block_device *bdev) @@ -1147,10 +1142,8 @@ static inline int queue_discard_alignment(struct request_queue *q) return q->limits.discard_alignment; } -static inline int queue_sector_discard_alignment(struct request_queue *q, - sector_t sector) +static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector_t sector) { - struct queue_limits *lim = &q->limits; unsigned int alignment = (sector << 9) & (lim->discard_granularity - 1); return (lim->discard_granularity + lim->discard_alignment - alignment) -- cgit v1.2.3 From c551866e649bac66a5145d100f34086d6edb581e Mon Sep 17 00:00:00 2001 From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com> Date: Fri, 11 Dec 2009 19:10:49 -0800 Subject: nfsd41: nfsd4_decode_compound() does not recognize all ops The server incorrectly assumes that the operations in the array start with value 0. The first operation (OP_ACCESS) has a value of 3, causing the check in nfsd4_decode_compound to be off. Instead of comparing that the operation number is less than the number of elements in the array, the server should verify that it is less than the maximum valid operation number defined by LAST_NFS4_OP. Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com> Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu> --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index a8587e90fd5a..4f14f0c0616f 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1434,7 +1434,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) } op->opnum = ntohl(*argp->p++); - if (op->opnum >= OP_ACCESS && op->opnum < ops->nops) + if (op->opnum >= OP_ACCESS && op->opnum <= LAST_NFS4_OP) op->status = ops->decoders[op->opnum](argp, &op->u); else { op->opnum = OP_ILLEGAL; -- cgit v1.2.3 From de3cab793c6a5c8505d66bee111edcc7098380ba Mon Sep 17 00:00:00 2001 From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com> Date: Fri, 11 Dec 2009 20:03:27 -0800 Subject: nfsd4: Use FIRST_NFS4_OP in nfsd4_decode_compound() Since we're checking for LAST_NFS4_OP, use FIRST_NFS4_OP to be consistent. Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com> Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu> --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 4f14f0c0616f..c458fb11c957 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1434,7 +1434,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) } op->opnum = ntohl(*argp->p++); - if (op->opnum >= OP_ACCESS && op->opnum <= LAST_NFS4_OP) + if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP) op->status = ops->decoders[op->opnum](argp, &op->u); else { op->opnum = OP_ILLEGAL; -- cgit v1.2.3 From 6a68f89ee1f2d177af4a5410fa7a45734c975fd6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 25 Dec 2009 17:45:13 +0100 Subject: nfsd: use vfs_fsync for non-directories Instead of opencoding the fsync calling sequence use vfs_fsync. This also gets rid of the useless i_mutex over the data writeout. Consolidate the remaining special code for syncing directories and document it's quirks. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu> --- fs/nfsd/vfs.c | 49 +++++++++++++++++-------------------------------- 1 file changed, 17 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c194793b642b..79d216f276d9 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -769,40 +769,25 @@ nfsd_close(struct file *filp) } /* - * Sync a file - * As this calls fsync (not fdatasync) there is no need for a write_inode - * after it. + * Sync a directory to disk. + * + * We can't just call vfs_fsync because our requirements are slightly odd: + * + * a) we do not have a file struct available + * b) we expect to have i_mutex already held by the caller */ -static inline int nfsd_dosync(struct file *filp, struct dentry *dp, - const struct file_operations *fop) -{ - struct inode *inode = dp->d_inode; - int (*fsync) (struct file *, struct dentry *, int); - int err; - - err = filemap_write_and_wait(inode->i_mapping); - if (err == 0 && fop && (fsync = fop->fsync)) - err = fsync(filp, dp, 0); - return err; -} - -static int -nfsd_sync(struct file *filp) +int +nfsd_sync_dir(struct dentry *dentry) { - int err; - struct inode *inode = filp->f_path.dentry->d_inode; - dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name); - mutex_lock(&inode->i_mutex); - err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op); - mutex_unlock(&inode->i_mutex); + struct inode *inode = dentry->d_inode; + int error; - return err; -} + WARN_ON(!mutex_is_locked(&inode->i_mutex)); -int -nfsd_sync_dir(struct dentry *dp) -{ - return nfsd_dosync(NULL, dp, dp->d_inode->i_fop); + error = filemap_write_and_wait(inode->i_mapping); + if (!error && inode->i_fop->fsync) + error = inode->i_fop->fsync(NULL, dentry, 0); + return error; } /* @@ -1008,7 +993,7 @@ static int wait_for_concurrent_writes(struct file *file) if (inode->i_state & I_DIRTY) { dprintk("nfsd: write sync %d\n", task_pid_nr(current)); - err = nfsd_sync(file); + err = vfs_fsync(file, file->f_path.dentry, 0); } last_ino = inode->i_ino; last_dev = inode->i_sb->s_dev; @@ -1177,7 +1162,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, return err; if (EX_ISSYNC(fhp->fh_export)) { if (file->f_op && file->f_op->fsync) { - err = nfserrno(nfsd_sync(file)); + err = nfserrno(vfs_fsync(file, file->f_path.dentry, 0)); } else { err = nfserr_notsupp; } -- cgit v1.2.3 From 8b8aae4009349397fffe7bd38a8fa200c9a5bcad Mon Sep 17 00:00:00 2001 From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com> Date: Fri, 11 Dec 2009 19:10:48 -0800 Subject: nfsd41: Create the recovery entry for the NFSv4.1 client Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com> Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu> --- fs/nfsd/nfs4state.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f19ed866c95f..3a20c09353ed 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2480,8 +2480,10 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf } memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); - if (nfsd4_has_session(&resp->cstate)) + if (nfsd4_has_session(&resp->cstate)) { open->op_stateowner->so_confirmed = 1; + nfsd4_create_clid_dir(open->op_stateowner->so_client); + } /* * Attempt to hand out a delegation. No error return, because the -- cgit v1.2.3 From 3a85cd96d3ab3c6dcf88b81fc6eaddb84e565a43 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Thu, 14 Jan 2010 01:33:55 +0000 Subject: xfs: add tracing to xfs_swap_extents To be able to diagnose whether the swap extents function is detecting compatible inode data fork configurations for swapping extents, add tracing points to the code to allow us to see the format of the inode forks before and after the swap. Signed-off-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_trace.h | 53 ++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_dfrag.c | 5 +++++ 2 files changed, 58 insertions(+) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index c22a608321a3..3353aef50530 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -1414,6 +1414,59 @@ TRACE_EVENT(xfs_dir2_leafn_moveents, __entry->count) ); +#define XFS_SWAPEXT_INODES \ + { 0, "target" }, \ + { 1, "temp" } + +#define XFS_INODE_FORMAT_STR \ + { 0, "invalid" }, \ + { 1, "local" }, \ + { 2, "extent" }, \ + { 3, "btree" } + +DECLARE_EVENT_CLASS(xfs_swap_extent_class, + TP_PROTO(struct xfs_inode *ip, int which), + TP_ARGS(ip, which), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, which) + __field(xfs_ino_t, ino) + __field(int, format) + __field(int, nex) + __field(int, max_nex) + __field(int, broot_size) + __field(int, fork_off) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->which = which; + __entry->ino = ip->i_ino; + __entry->format = ip->i_d.di_format; + __entry->nex = ip->i_d.di_nextents; + __entry->max_nex = ip->i_df.if_ext_max; + __entry->broot_size = ip->i_df.if_broot_bytes; + __entry->fork_off = XFS_IFORK_BOFF(ip); + ), + TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " + "Max in-fork extents %d, broot size %d, fork offset %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), + __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), + __entry->nex, + __entry->max_nex, + __entry->broot_size, + __entry->fork_off) +) + +#define DEFINE_SWAPEXT_EVENT(name) \ +DEFINE_EVENT(xfs_swap_extent_class, name, \ + TP_PROTO(struct xfs_inode *ip, int which), \ + TP_ARGS(ip, which)) + +DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); +DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 84ca1cf16a1e..f25e54027d10 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -254,6 +254,9 @@ xfs_swap_extents( goto out_unlock; } + trace_xfs_swap_extent_before(ip, 0); + trace_xfs_swap_extent_before(tip, 1); + /* check inode formats now that data is flushed */ error = xfs_swap_extents_check_format(ip, tip); if (error) { @@ -421,6 +424,8 @@ xfs_swap_extents( error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); + trace_xfs_swap_extent_after(ip, 0); + trace_xfs_swap_extent_after(tip, 1); out: kmem_free(tempifp); return error; -- cgit v1.2.3 From 6bded0f383fd7971b76ad6c194dda7d5b814b871 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Thu, 14 Jan 2010 01:33:56 +0000 Subject: xfs: clean up inconsistent variable naming in xfs_swap_extent The swap extent ioctl passes in a target inode and a temporary inode which are clearly named in the ioctl structure. The code then assigns temp to target and vice versa, making it extremely difficult to work out which inode is which later in the code. Make this consistent throughout the code. Also make xfs_swap_extent static as there are no external users of the function. Signed-off-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/xfs_dfrag.c | 38 ++++++++++++++++++++++---------------- fs/xfs/xfs_dfrag.h | 3 --- 2 files changed, 22 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index f25e54027d10..cd27c9d6c71f 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -45,15 +45,21 @@ #include "xfs_vnodeops.h" #include "xfs_trace.h" + +static int xfs_swap_extents( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip, /* tmp inode */ + xfs_swapext_t *sxp); + /* - * Syssgi interface for swapext + * ioctl interface for swapext */ int xfs_swapext( xfs_swapext_t *sxp) { xfs_inode_t *ip, *tip; - struct file *file, *target_file; + struct file *file, *tmp_file; int error = 0; /* Pull information for the target fd */ @@ -68,46 +74,46 @@ xfs_swapext( goto out_put_file; } - target_file = fget((int)sxp->sx_fdtmp); - if (!target_file) { + tmp_file = fget((int)sxp->sx_fdtmp); + if (!tmp_file) { error = XFS_ERROR(EINVAL); goto out_put_file; } - if (!(target_file->f_mode & FMODE_WRITE) || - (target_file->f_flags & O_APPEND)) { + if (!(tmp_file->f_mode & FMODE_WRITE) || + (tmp_file->f_flags & O_APPEND)) { error = XFS_ERROR(EBADF); - goto out_put_target_file; + goto out_put_tmp_file; } if (IS_SWAPFILE(file->f_path.dentry->d_inode) || - IS_SWAPFILE(target_file->f_path.dentry->d_inode)) { + IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) { error = XFS_ERROR(EINVAL); - goto out_put_target_file; + goto out_put_tmp_file; } ip = XFS_I(file->f_path.dentry->d_inode); - tip = XFS_I(target_file->f_path.dentry->d_inode); + tip = XFS_I(tmp_file->f_path.dentry->d_inode); if (ip->i_mount != tip->i_mount) { error = XFS_ERROR(EINVAL); - goto out_put_target_file; + goto out_put_tmp_file; } if (ip->i_ino == tip->i_ino) { error = XFS_ERROR(EINVAL); - goto out_put_target_file; + goto out_put_tmp_file; } if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { error = XFS_ERROR(EIO); - goto out_put_target_file; + goto out_put_tmp_file; } error = xfs_swap_extents(ip, tip, sxp); - out_put_target_file: - fput(target_file); + out_put_tmp_file: + fput(tmp_file); out_put_file: fput(file); out: @@ -186,7 +192,7 @@ xfs_swap_extents_check_format( return 0; } -int +static int xfs_swap_extents( xfs_inode_t *ip, /* target inode */ xfs_inode_t *tip, /* tmp inode */ diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h index 4f55a6306558..20bdd935c121 100644 --- a/fs/xfs/xfs_dfrag.h +++ b/fs/xfs/xfs_dfrag.h @@ -48,9 +48,6 @@ typedef struct xfs_swapext */ int xfs_swapext(struct xfs_swapext *sx); -int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, - struct xfs_swapext *sxp); - #endif /* __KERNEL__ */ #endif /* __XFS_DFRAG_H__ */ -- cgit v1.2.3 From 5d77c0dc0c05c2c65aee16149fae06831a118730 Mon Sep 17 00:00:00 2001 From: Eric Sandeen <sandeen@sandeen.net> Date: Thu, 19 Nov 2009 15:52:00 +0000 Subject: xfs: make several more functions static Just minor housekeeping, a lot more functions can be trivially made static; others could if we reordered things a bit... Signed-off-by: Eric Sandeen <sandeen@sandeen.net> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_sync.c | 2 +- fs/xfs/linux-2.6/xfs_sync.h | 1 - fs/xfs/xfs_attr.c | 2 +- fs/xfs/xfs_attr.h | 1 - fs/xfs/xfs_bmap_btree.c | 2 +- fs/xfs/xfs_bmap_btree.h | 1 - fs/xfs/xfs_dir2_node.c | 2 +- fs/xfs/xfs_dir2_node.h | 2 -- fs/xfs/xfs_log_priv.h | 5 ----- fs/xfs/xfs_log_recover.c | 6 +++--- 10 files changed, 7 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 1f5e4bb5e970..0f90bfe2815f 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -351,7 +351,7 @@ xfs_commit_dummy_trans( return error; } -int +STATIC int xfs_sync_fsdata( struct xfs_mount *mp, int flags) diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index ea932b43335d..d480c346cabb 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -37,7 +37,6 @@ void xfs_syncd_stop(struct xfs_mount *mp); int xfs_sync_attr(struct xfs_mount *mp, int flags); int xfs_sync_data(struct xfs_mount *mp, int flags); -int xfs_sync_fsdata(struct xfs_mount *mp, int flags); int xfs_quiesce_data(struct xfs_mount *mp); void xfs_quiesce_attr(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index e953b6cfb2a8..9d11ebad43b6 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -197,7 +197,7 @@ xfs_attr_get( /* * Calculate how many blocks we need for the new attribute, */ -int +STATIC int xfs_attr_calc_size( struct xfs_inode *ip, int namelen, diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index 59b410ce69a1..9c3a24372914 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -139,7 +139,6 @@ typedef struct xfs_attr_list_context { /* * Overall external interface routines. */ -int xfs_attr_calc_size(struct xfs_inode *, int, int, int *); int xfs_attr_inactive(struct xfs_inode *dp); int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_list_int(struct xfs_attr_list_context *); diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 38751d5fac6f..416e47e54b83 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -334,7 +334,7 @@ xfs_bmbt_disk_set_allf( /* * Set all the fields in a bmap extent record from the uncompressed form. */ -void +STATIC void xfs_bmbt_disk_set_all( xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s) diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index cf07ca7c22e7..0e66c4ea0f85 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -223,7 +223,6 @@ extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v); extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v); extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v); -extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o, xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index ce6e355199b5..78fc4d9ae756 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -65,7 +65,7 @@ static int xfs_dir2_node_addname_int(xfs_da_args_t *args, /* * Log entries from a freespace block. */ -void +STATIC void xfs_dir2_free_log_bests( xfs_trans_t *tp, /* transaction pointer */ xfs_dabuf_t *bp, /* freespace buffer */ diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h index dde72db3d695..82dfe7147195 100644 --- a/fs/xfs/xfs_dir2_node.h +++ b/fs/xfs/xfs_dir2_node.h @@ -75,8 +75,6 @@ xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db) return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp)); } -extern void xfs_dir2_free_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp, - int first, int last); extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, struct xfs_dabuf *lbp); extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index d55662db7077..fd02a18facd5 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -443,14 +443,9 @@ typedef struct log { /* common routines */ extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); -extern int xlog_find_tail(xlog_t *log, - xfs_daddr_t *head_blk, - xfs_daddr_t *tail_blk); extern int xlog_recover(xlog_t *log); extern int xlog_recover_finish(xlog_t *log); extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); -extern struct xfs_buf *xlog_get_bp(xlog_t *, int); -extern void xlog_put_bp(struct xfs_buf *); extern kmem_zone_t *xfs_log_ticket_zone; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 69ac2e5ef20c..48a7ab1e6311 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -68,7 +68,7 @@ STATIC void xlog_recover_check_summary(xlog_t *); ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) #define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) -xfs_buf_t * +STATIC xfs_buf_t * xlog_get_bp( xlog_t *log, int nbblks) @@ -88,7 +88,7 @@ xlog_get_bp( return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); } -void +STATIC void xlog_put_bp( xfs_buf_t *bp) { @@ -805,7 +805,7 @@ xlog_find_head( * We could speed up search by using current head_blk buffer, but it is not * available. */ -int +STATIC int xlog_find_tail( xlog_t *log, xfs_daddr_t *head_blk, -- cgit v1.2.3 From f0a7695380efa31cd281730917f7e907a724d5cb Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Mon, 11 Jan 2010 11:49:57 +0000 Subject: xfs: Use list_heads for log recovery item lists Remove the roll-your-own linked list operations. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/xfs_log_recover.c | 205 ++++++++++++++++------------------------------- fs/xfs/xfs_log_recover.h | 23 +++--- 2 files changed, 81 insertions(+), 147 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 48a7ab1e6311..65f1f137d789 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -50,8 +50,6 @@ STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); -STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q, - xlog_recover_item_t *item); #if defined(DEBUG) STATIC void xlog_recover_check_summary(xlog_t *); #else @@ -1367,36 +1365,45 @@ xlog_clear_stale_blocks( STATIC xlog_recover_t * xlog_recover_find_tid( - xlog_recover_t *q, + struct hlist_head *head, xlog_tid_t tid) { - xlog_recover_t *p = q; + xlog_recover_t *trans; + struct hlist_node *n; - while (p != NULL) { - if (p->r_log_tid == tid) - break; - p = p->r_next; + hlist_for_each_entry(trans, n, head, r_list) { + if (trans->r_log_tid == tid) + return trans; } - return p; + return NULL; } STATIC void -xlog_recover_put_hashq( - xlog_recover_t **q, - xlog_recover_t *trans) +xlog_recover_new_tid( + struct hlist_head *head, + xlog_tid_t tid, + xfs_lsn_t lsn) { - trans->r_next = *q; - *q = trans; + xlog_recover_t *trans; + + trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); + trans->r_log_tid = tid; + trans->r_lsn = lsn; + INIT_LIST_HEAD(&trans->r_itemq); + + INIT_HLIST_NODE(&trans->r_list); + hlist_add_head(&trans->r_list, head); } STATIC void xlog_recover_add_item( - xlog_recover_item_t **itemq) + struct list_head *head) { xlog_recover_item_t *item; item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); - xlog_recover_insert_item_backq(itemq, item); + INIT_LIST_HEAD(&item->ri_list); + list_add_tail(&item->ri_list, head); } STATIC int @@ -1409,8 +1416,7 @@ xlog_recover_add_to_cont_trans( xfs_caddr_t ptr, old_ptr; int old_len; - item = trans->r_itemq; - if (item == NULL) { + if (list_empty(&trans->r_itemq)) { /* finish copying rest of trans header */ xlog_recover_add_item(&trans->r_itemq); ptr = (xfs_caddr_t) &trans->r_theader + @@ -1418,7 +1424,8 @@ xlog_recover_add_to_cont_trans( memcpy(ptr, dp, len); /* d, s, l */ return 0; } - item = item->ri_prev; + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; @@ -1455,8 +1462,7 @@ xlog_recover_add_to_trans( if (!len) return 0; - item = trans->r_itemq; - if (item == NULL) { + if (list_empty(&trans->r_itemq)) { /* we need to catch log corruptions here */ if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { xlog_warn("XFS: xlog_recover_add_to_trans: " @@ -1474,12 +1480,15 @@ xlog_recover_add_to_trans( memcpy(ptr, dp, len); in_f = (xfs_inode_log_format_t *)ptr; - if (item->ri_prev->ri_total != 0 && - item->ri_prev->ri_total == item->ri_prev->ri_cnt) { + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + if (item->ri_total != 0 && + item->ri_total == item->ri_cnt) { + /* tail item is in use, get a new one */ xlog_recover_add_item(&trans->r_itemq); + item = list_entry(trans->r_itemq.prev, + xlog_recover_item_t, ri_list); } - item = trans->r_itemq; - item = item->ri_prev; if (item->ri_total == 0) { /* first region to be added */ if (in_f->ilf_size == 0 || @@ -1504,96 +1513,29 @@ xlog_recover_add_to_trans( return 0; } -STATIC void -xlog_recover_new_tid( - xlog_recover_t **q, - xlog_tid_t tid, - xfs_lsn_t lsn) -{ - xlog_recover_t *trans; - - trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); - trans->r_log_tid = tid; - trans->r_lsn = lsn; - xlog_recover_put_hashq(q, trans); -} - -STATIC int -xlog_recover_unlink_tid( - xlog_recover_t **q, - xlog_recover_t *trans) -{ - xlog_recover_t *tp; - int found = 0; - - ASSERT(trans != NULL); - if (trans == *q) { - *q = (*q)->r_next; - } else { - tp = *q; - while (tp) { - if (tp->r_next == trans) { - found = 1; - break; - } - tp = tp->r_next; - } - if (!found) { - xlog_warn( - "XFS: xlog_recover_unlink_tid: trans not found"); - ASSERT(0); - return XFS_ERROR(EIO); - } - tp->r_next = tp->r_next->r_next; - } - return 0; -} - -STATIC void -xlog_recover_insert_item_backq( - xlog_recover_item_t **q, - xlog_recover_item_t *item) -{ - if (*q == NULL) { - item->ri_prev = item->ri_next = item; - *q = item; - } else { - item->ri_next = *q; - item->ri_prev = (*q)->ri_prev; - (*q)->ri_prev = item; - item->ri_prev->ri_next = item; - } -} - -STATIC void -xlog_recover_insert_item_frontq( - xlog_recover_item_t **q, - xlog_recover_item_t *item) -{ - xlog_recover_insert_item_backq(q, item); - *q = item; -} - +/* + * Sort the log items in the transaction. Cancelled buffers need + * to be put first so they are processed before any items that might + * modify the buffers. If they are cancelled, then the modifications + * don't need to be replayed. + */ STATIC int xlog_recover_reorder_trans( xlog_recover_t *trans) { - xlog_recover_item_t *first_item, *itemq, *itemq_next; - xfs_buf_log_format_t *buf_f; - ushort flags = 0; + xlog_recover_item_t *item, *n; + LIST_HEAD(sort_list); + + list_splice_init(&trans->r_itemq, &sort_list); + list_for_each_entry_safe(item, n, &sort_list, ri_list) { + xfs_buf_log_format_t *buf_f; - first_item = itemq = trans->r_itemq; - trans->r_itemq = NULL; - do { - itemq_next = itemq->ri_next; - buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr; + buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr; - switch (ITEM_TYPE(itemq)) { + switch (ITEM_TYPE(item)) { case XFS_LI_BUF: - flags = buf_f->blf_flags; - if (!(flags & XFS_BLI_CANCEL)) { - xlog_recover_insert_item_frontq(&trans->r_itemq, - itemq); + if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { + list_move(&item->ri_list, &trans->r_itemq); break; } case XFS_LI_INODE: @@ -1601,7 +1543,7 @@ xlog_recover_reorder_trans( case XFS_LI_QUOTAOFF: case XFS_LI_EFD: case XFS_LI_EFI: - xlog_recover_insert_item_backq(&trans->r_itemq, itemq); + list_move_tail(&item->ri_list, &trans->r_itemq); break; default: xlog_warn( @@ -1609,8 +1551,8 @@ xlog_recover_reorder_trans( ASSERT(0); return XFS_ERROR(EIO); } - itemq = itemq_next; - } while (first_item != itemq); + } + ASSERT(list_empty(&sort_list)); return 0; } @@ -2814,14 +2756,13 @@ xlog_recover_do_trans( int pass) { int error = 0; - xlog_recover_item_t *item, *first_item; + xlog_recover_item_t *item; error = xlog_recover_reorder_trans(trans); if (error) return error; - first_item = item = trans->r_itemq; - do { + list_for_each_entry(item, &trans->r_itemq, ri_list) { switch (ITEM_TYPE(item)) { case XFS_LI_BUF: error = xlog_recover_do_buffer_trans(log, item, pass); @@ -2854,8 +2795,7 @@ xlog_recover_do_trans( if (error) return error; - item = item->ri_next; - } while (first_item != item); + } return 0; } @@ -2869,21 +2809,18 @@ STATIC void xlog_recover_free_trans( xlog_recover_t *trans) { - xlog_recover_item_t *first_item, *item, *free_item; + xlog_recover_item_t *item, *n; int i; - item = first_item = trans->r_itemq; - do { - free_item = item; - item = item->ri_next; - /* Free the regions in the item. */ - for (i = 0; i < free_item->ri_cnt; i++) { - kmem_free(free_item->ri_buf[i].i_addr); - } + list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { + /* Free the regions in the item. */ + list_del(&item->ri_list); + for (i = 0; i < item->ri_cnt; i++) + kmem_free(item->ri_buf[i].i_addr); /* Free the item itself */ - kmem_free(free_item->ri_buf); - kmem_free(free_item); - } while (first_item != item); + kmem_free(item->ri_buf); + kmem_free(item); + } /* Free the transaction recover structure */ kmem_free(trans); } @@ -2891,14 +2828,12 @@ xlog_recover_free_trans( STATIC int xlog_recover_commit_trans( xlog_t *log, - xlog_recover_t **q, xlog_recover_t *trans, int pass) { int error; - if ((error = xlog_recover_unlink_tid(q, trans))) - return error; + hlist_del(&trans->r_list); if ((error = xlog_recover_do_trans(log, trans, pass))) return error; xlog_recover_free_trans(trans); /* no error */ @@ -2926,7 +2861,7 @@ xlog_recover_unmount_trans( STATIC int xlog_recover_process_data( xlog_t *log, - xlog_recover_t *rhash[], + struct hlist_head rhash[], xlog_rec_header_t *rhead, xfs_caddr_t dp, int pass) @@ -2960,7 +2895,7 @@ xlog_recover_process_data( } tid = be32_to_cpu(ohead->oh_tid); hash = XLOG_RHASH(tid); - trans = xlog_recover_find_tid(rhash[hash], tid); + trans = xlog_recover_find_tid(&rhash[hash], tid); if (trans == NULL) { /* not found; add new tid */ if (ohead->oh_flags & XLOG_START_TRANS) xlog_recover_new_tid(&rhash[hash], tid, @@ -2978,7 +2913,7 @@ xlog_recover_process_data( switch (flags) { case XLOG_COMMIT_TRANS: error = xlog_recover_commit_trans(log, - &rhash[hash], trans, pass); + trans, pass); break; case XLOG_UNMOUNT_TRANS: error = xlog_recover_unmount_trans(trans); @@ -3517,7 +3452,7 @@ xlog_do_recovery_pass( int error = 0, h_size; int bblks, split_bblks; int hblks, split_hblks, wrapped_hblks; - xlog_recover_t *rhash[XLOG_RHASH_SIZE]; + struct hlist_head rhash[XLOG_RHASH_SIZE]; ASSERT(head_blk != tail_blk); diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h index b22545555301..75d749207258 100644 --- a/fs/xfs/xfs_log_recover.h +++ b/fs/xfs/xfs_log_recover.h @@ -35,22 +35,21 @@ * item headers are in ri_buf[0]. Additional buffers follow. */ typedef struct xlog_recover_item { - struct xlog_recover_item *ri_next; - struct xlog_recover_item *ri_prev; - int ri_type; - int ri_cnt; /* count of regions found */ - int ri_total; /* total regions */ - xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ + struct list_head ri_list; + int ri_type; + int ri_cnt; /* count of regions found */ + int ri_total; /* total regions */ + xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ } xlog_recover_item_t; struct xlog_tid; typedef struct xlog_recover { - struct xlog_recover *r_next; - xlog_tid_t r_log_tid; /* log's transaction id */ - xfs_trans_header_t r_theader; /* trans header for partial */ - int r_state; /* not needed */ - xfs_lsn_t r_lsn; /* xact lsn */ - xlog_recover_item_t *r_itemq; /* q for items */ + struct hlist_node r_list; + xlog_tid_t r_log_tid; /* log's transaction id */ + xfs_trans_header_t r_theader; /* trans header for partial */ + int r_state; /* not needed */ + xfs_lsn_t r_lsn; /* xact lsn */ + struct list_head r_itemq; /* q for items */ } xlog_recover_t; #define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr) -- cgit v1.2.3 From 453eac8a9aa417878a38bdfbccafd5f7ce4e8e4e Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Mon, 11 Jan 2010 11:49:58 +0000 Subject: xfs: Don't wake the aild once per second Now that the AIL push algorithm is traversal safe, we don't need a watchdog function in the xfsaild to catch pushes that fail to make progress. Remove the watchdog timeout and make pushes purely driven by demand. This will remove the once-per-second wakeup that is seen when the filesystem is idle and make laptop power misers happy. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_super.c | 7 +++---- fs/xfs/xfs_trans_ail.c | 19 +++++++++++-------- 2 files changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 77414db10dc2..9f2e398a5616 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -877,12 +877,11 @@ xfsaild( { struct xfs_ail *ailp = data; xfs_lsn_t last_pushed_lsn = 0; - long tout = 0; + long tout = 0; /* milliseconds */ while (!kthread_should_stop()) { - if (tout) - schedule_timeout_interruptible(msecs_to_jiffies(tout)); - tout = 1000; + schedule_timeout_interruptible(tout ? + msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); /* swsusp */ try_to_freeze(); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 2ffc570679be..063dfbdca94b 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -237,14 +237,15 @@ out: } /* - * Function that does the work of pushing on the AIL + * xfsaild_push does the work of pushing on the AIL. Returning a timeout of + * zero indicates that the caller should sleep until woken. */ long xfsaild_push( struct xfs_ail *ailp, xfs_lsn_t *last_lsn) { - long tout = 1000; /* milliseconds */ + long tout = 0; xfs_lsn_t last_pushed_lsn = *last_lsn; xfs_lsn_t target = ailp->xa_target; xfs_lsn_t lsn; @@ -262,7 +263,7 @@ xfsaild_push( */ xfs_trans_ail_cursor_done(ailp, cur); spin_unlock(&ailp->xa_lock); - last_pushed_lsn = 0; + *last_lsn = 0; return tout; } @@ -279,7 +280,6 @@ xfsaild_push( * prevents use from spinning when we can't do anything or there is * lots of contention on the AIL lists. */ - tout = 10; lsn = lip->li_lsn; flush_log = stuck = count = 0; while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) { @@ -376,14 +376,14 @@ xfsaild_push( if (!count) { /* We're past our target or empty, so idle */ - tout = 1000; + last_pushed_lsn = 0; } else if (XFS_LSN_CMP(lsn, target) >= 0) { /* * We reached the target so wait a bit longer for I/O to * complete and remove pushed items from the AIL before we * start the next scan from the start of the AIL. */ - tout += 20; + tout = 50; last_pushed_lsn = 0; } else if ((stuck * 100) / count > 90) { /* @@ -395,11 +395,14 @@ xfsaild_push( * Backoff a bit more to allow some I/O to complete before * continuing from where we were. */ - tout += 10; + tout = 20; + } else { + /* more to do, but wait a short while before continuing */ + tout = 10; } *last_lsn = last_pushed_lsn; return tout; -} /* xfsaild_push */ +} /* -- cgit v1.2.3 From c9c129714e71c890bed1bd5b61697a896c3c2d54 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Mon, 11 Jan 2010 11:49:59 +0000 Subject: xfs: Don't wake xfsbufd when idle The xfsbufd wakes every xfsbufd_centisecs (once per second by default) for each filesystem even when the filesystem is idle. If the xfsbufd has nothing to do, put it into a long term sleep and only wake it up when there is work pending (i.e. dirty buffers to flush soon). This will make laptop power misers happy. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_buf.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 77b8be81c769..18ae3ba8f78a 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1595,6 +1595,11 @@ xfs_buf_delwri_queue( list_del(&bp->b_list); } + if (list_empty(dwq)) { + /* start xfsbufd as it is about to have something to do */ + wake_up_process(bp->b_target->bt_task); + } + bp->b_flags |= _XBF_DELWRI_Q; list_add_tail(&bp->b_list, dwq); bp->b_queuetime = jiffies; @@ -1644,6 +1649,8 @@ xfsbufd_wakeup( list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) continue; + if (list_empty(&btp->bt_delwrite_queue)) + continue; set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); wake_up_process(btp->bt_task); } @@ -1708,6 +1715,9 @@ xfsbufd( set_freezable(); do { + long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); + long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); + if (unlikely(freezing(current))) { set_bit(XBT_FORCE_SLEEP, &target->bt_flags); refrigerator(); @@ -1715,12 +1725,12 @@ xfsbufd( clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); } - schedule_timeout_interruptible( - xfs_buf_timer_centisecs * msecs_to_jiffies(10)); - - xfs_buf_delwri_split(target, &tmp, - xfs_buf_age_centisecs * msecs_to_jiffies(10)); + /* sleep for a long time if there is nothing to do. */ + if (list_empty(&target->bt_delwrite_queue)) + tout = MAX_SCHEDULE_TIMEOUT; + schedule_timeout_interruptible(tout); + xfs_buf_delwri_split(target, &tmp, age); count = 0; while (!list_empty(&tmp)) { bp = list_entry(tmp.next, xfs_buf_t, b_list); -- cgit v1.2.3 From 5017e97d52628fb8ae56e434e86ac2e72ddaac2b Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Mon, 11 Jan 2010 11:47:40 +0000 Subject: xfs: rename xfs_get_perag xfs_get_perag is really getting the perag that an inode belongs to based on it's inode number. Convert the use of this function to just get the perag from a provided ag number. Use this new function to obtain the per-ag structure when traversing the per AG inode trees for sync and reclaim. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_sync.c | 22 +++++++++++++--------- fs/xfs/xfs_iget.c | 10 +++++----- fs/xfs/xfs_inode.c | 8 +++++--- fs/xfs/xfs_mount.h | 8 ++++---- 4 files changed, 27 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 0f90bfe2815f..cc964faf12e9 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -90,14 +90,13 @@ xfs_inode_ag_lookup( STATIC int xfs_inode_ag_walk( struct xfs_mount *mp, - xfs_agnumber_t ag, + struct xfs_perag *pag, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), int flags, int tag, int exclusive) { - struct xfs_perag *pag = &mp->m_perag[ag]; uint32_t first_index; int last_error = 0; int skipped; @@ -141,8 +140,6 @@ restart: delay(1); goto restart; } - - xfs_put_perag(mp, pag); return last_error; } @@ -160,10 +157,16 @@ xfs_inode_ag_iterator( xfs_agnumber_t ag; for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { - if (!mp->m_perag[ag].pag_ici_init) + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, ag); + if (!pag->pag_ici_init) { + xfs_perag_put(pag); continue; - error = xfs_inode_ag_walk(mp, ag, execute, flags, tag, + } + error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, exclusive); + xfs_perag_put(pag); if (error) { last_error = error; if (error == EFSCORRUPTED) @@ -690,16 +693,17 @@ void xfs_inode_set_reclaim_tag( xfs_inode_t *ip) { - xfs_mount_t *mp = ip->i_mount; - xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); read_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); __xfs_inode_set_reclaim_tag(pag, ip); __xfs_iflags_set(ip, XFS_IRECLAIMABLE); spin_unlock(&ip->i_flags_lock); read_unlock(&pag->pag_ici_lock); - xfs_put_perag(mp, pag); + xfs_perag_put(pag); } void diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 155e798f30a1..e281eb4a1c49 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -374,7 +374,7 @@ xfs_iget( return EINVAL; /* get the perag structure and ensure that it's inode capable */ - pag = xfs_get_perag(mp, ino); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); if (!pag->pagi_inodeok) return EINVAL; ASSERT(pag->pag_ici_init); @@ -398,7 +398,7 @@ again: if (error) goto out_error_or_again; } - xfs_put_perag(mp, pag); + xfs_perag_put(pag); *ipp = ip; @@ -417,7 +417,7 @@ out_error_or_again: delay(1); goto again; } - xfs_put_perag(mp, pag); + xfs_perag_put(pag); return error; } @@ -488,12 +488,12 @@ xfs_ireclaim( * added to the tree assert that it's been there before to catch * problems with the inode life time early on. */ - pag = xfs_get_perag(mp, ip->i_ino); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); write_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, agino)) ASSERT(0); write_unlock(&pag->pag_ici_lock); - xfs_put_perag(mp, pag); + xfs_perag_put(pag); /* * Here we do an (almost) spurious inode lock in order to coordinate diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ef77fd88c8e3..bd3d81636d51 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1946,8 +1946,9 @@ xfs_ifree_cluster( xfs_inode_t *ip, **ip_found; xfs_inode_log_item_t *iip; xfs_log_item_t *lip; - xfs_perag_t *pag = xfs_get_perag(mp, inum); + struct xfs_perag *pag; + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { blks_per_cluster = 1; ninodes = mp->m_sb.sb_inopblock; @@ -2088,7 +2089,7 @@ xfs_ifree_cluster( } kmem_free(ip_found); - xfs_put_perag(mp, pag); + xfs_perag_put(pag); } /* @@ -2675,7 +2676,7 @@ xfs_iflush_cluster( xfs_buf_t *bp) { xfs_mount_t *mp = ip->i_mount; - xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); + struct xfs_perag *pag; unsigned long first_index, mask; unsigned long inodes_per_cluster; int ilist_size; @@ -2686,6 +2687,7 @@ xfs_iflush_cluster( int bufwasdelwri; int i; + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); ASSERT(pag->pagi_inodeok); ASSERT(pag->pag_ici_init); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1df7e4502967..f8a68a2319b5 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -386,14 +386,14 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) /* * perag get/put wrappers for eventual ref counting */ -static inline xfs_perag_t * -xfs_get_perag(struct xfs_mount *mp, xfs_ino_t ino) +static inline struct xfs_perag * +xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) { - return &mp->m_perag[XFS_INO_TO_AGNO(mp, ino)]; + return &mp->m_perag[agno]; } static inline void -xfs_put_perag(struct xfs_mount *mp, xfs_perag_t *pag) +xfs_perag_put(struct xfs_perag *pag) { /* nothing to see here, move along */ } -- cgit v1.2.3 From a862e0fdcb8862aab2538ec2fc2f0dc07a625c59 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Mon, 11 Jan 2010 11:47:41 +0000 Subject: xfs: Don't directly reference m_perag in allocation code Start abstracting the perag references so that the indexing of the structures is not directly coded into all the places that uses the perag structures. This will allow us to separate the use of the perag structure and the way it is indexed and hence avoid the known deadlocks related to growing a busy filesystem. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/xfs_alloc.c | 82 +++++++++++++++++++++++++++--------------------- fs/xfs/xfs_alloc_btree.c | 9 ++++-- 2 files changed, 53 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 275b1f4f9430..84070f2e0ba4 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1662,11 +1662,13 @@ xfs_free_ag_extent( xfs_agf_t *agf; xfs_perag_t *pag; /* per allocation group data */ + pag = xfs_perag_get(mp, agno); + pag->pagf_freeblks += len; + xfs_perag_put(pag); + agf = XFS_BUF_TO_AGF(agbp); - pag = &mp->m_perag[agno]; be32_add_cpu(&agf->agf_freeblks, len); xfs_trans_agblocks_delta(tp, len); - pag->pagf_freeblks += len; XFS_WANT_CORRUPTED_GOTO( be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length), @@ -1969,10 +1971,12 @@ xfs_alloc_get_freelist( xfs_trans_brelse(tp, agflbp); if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) agf->agf_flfirst = 0; - pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)]; + + pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); be32_add_cpu(&agf->agf_flcount, -1); xfs_trans_agflist_delta(tp, -1); pag->pagf_flcount--; + xfs_perag_put(pag); logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; if (btreeblk) { @@ -2078,7 +2082,8 @@ xfs_alloc_put_freelist( be32_add_cpu(&agf->agf_fllast, 1); if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp)) agf->agf_fllast = 0; - pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)]; + + pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); be32_add_cpu(&agf->agf_flcount, 1); xfs_trans_agflist_delta(tp, 1); pag->pagf_flcount++; @@ -2089,6 +2094,7 @@ xfs_alloc_put_freelist( pag->pagf_btreeblks--; logflags |= XFS_AGF_BTREEBLKS; } + xfs_perag_put(pag); xfs_alloc_log_agf(tp, agbp, logflags); @@ -2152,7 +2158,6 @@ xfs_read_agf( xfs_trans_brelse(tp, *bpp); return XFS_ERROR(EFSCORRUPTED); } - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF); return 0; } @@ -2184,7 +2189,7 @@ xfs_alloc_read_agf( ASSERT(!XFS_BUF_GETERROR(*bpp)); agf = XFS_BUF_TO_AGF(*bpp); - pag = &mp->m_perag[agno]; + pag = xfs_perag_get(mp, agno); if (!pag->pagf_init) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); @@ -2211,6 +2216,7 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); } #endif + xfs_perag_put(pag); return 0; } @@ -2271,7 +2277,7 @@ xfs_alloc_vextent( */ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); down_read(&mp->m_peraglock); - args->pag = &mp->m_perag[args->agno]; + args->pag = xfs_perag_get(mp, args->agno); args->minleft = 0; error = xfs_alloc_fix_freelist(args, 0); args->minleft = minleft; @@ -2341,7 +2347,7 @@ xfs_alloc_vextent( */ down_read(&mp->m_peraglock); for (;;) { - args->pag = &mp->m_perag[args->agno]; + args->pag = xfs_perag_get(mp, args->agno); if (no_min) args->minleft = 0; error = xfs_alloc_fix_freelist(args, flags); args->minleft = minleft; @@ -2400,6 +2406,7 @@ xfs_alloc_vextent( } } } + xfs_perag_put(args->pag); } up_read(&mp->m_peraglock); if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) { @@ -2427,8 +2434,10 @@ xfs_alloc_vextent( args->len); #endif } + xfs_perag_put(args->pag); return 0; error0: + xfs_perag_put(args->pag); up_read(&mp->m_peraglock); return error; } @@ -2455,7 +2464,7 @@ xfs_free_extent( ASSERT(args.agno < args.mp->m_sb.sb_agcount); args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); down_read(&args.mp->m_peraglock); - args.pag = &args.mp->m_perag[args.agno]; + args.pag = xfs_perag_get(args.mp, args.agno); if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) goto error0; #ifdef DEBUG @@ -2465,6 +2474,7 @@ xfs_free_extent( #endif error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); error0: + xfs_perag_put(args.pag); up_read(&args.mp->m_peraglock); return error; } @@ -2486,15 +2496,15 @@ xfs_alloc_mark_busy(xfs_trans_t *tp, xfs_agblock_t bno, xfs_extlen_t len) { - xfs_mount_t *mp; xfs_perag_busy_t *bsy; + struct xfs_perag *pag; int n; - mp = tp->t_mountp; - spin_lock(&mp->m_perag[agno].pagb_lock); + pag = xfs_perag_get(tp->t_mountp, agno); + spin_lock(&pag->pagb_lock); /* search pagb_list for an open slot */ - for (bsy = mp->m_perag[agno].pagb_list, n = 0; + for (bsy = pag->pagb_list, n = 0; n < XFS_PAGB_NUM_SLOTS; bsy++, n++) { if (bsy->busy_tp == NULL) { @@ -2502,11 +2512,11 @@ xfs_alloc_mark_busy(xfs_trans_t *tp, } } - trace_xfs_alloc_busy(mp, agno, bno, len, n); + trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n); if (n < XFS_PAGB_NUM_SLOTS) { - bsy = &mp->m_perag[agno].pagb_list[n]; - mp->m_perag[agno].pagb_count++; + bsy = &pag->pagb_list[n]; + pag->pagb_count++; bsy->busy_start = bno; bsy->busy_length = len; bsy->busy_tp = tp; @@ -2521,7 +2531,8 @@ xfs_alloc_mark_busy(xfs_trans_t *tp, xfs_trans_set_sync(tp); } - spin_unlock(&mp->m_perag[agno].pagb_lock); + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); } void @@ -2529,24 +2540,23 @@ xfs_alloc_clear_busy(xfs_trans_t *tp, xfs_agnumber_t agno, int idx) { - xfs_mount_t *mp; + struct xfs_perag *pag; xfs_perag_busy_t *list; - mp = tp->t_mountp; - - spin_lock(&mp->m_perag[agno].pagb_lock); - list = mp->m_perag[agno].pagb_list; - ASSERT(idx < XFS_PAGB_NUM_SLOTS); + pag = xfs_perag_get(tp->t_mountp, agno); + spin_lock(&pag->pagb_lock); + list = pag->pagb_list; - trace_xfs_alloc_unbusy(mp, agno, idx, list[idx].busy_tp == tp); + trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp); if (list[idx].busy_tp == tp) { list[idx].busy_tp = NULL; - mp->m_perag[agno].pagb_count--; + pag->pagb_count--; } - spin_unlock(&mp->m_perag[agno].pagb_lock); + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); } @@ -2560,17 +2570,15 @@ xfs_alloc_search_busy(xfs_trans_t *tp, xfs_agblock_t bno, xfs_extlen_t len) { - xfs_mount_t *mp; + struct xfs_perag *pag; xfs_perag_busy_t *bsy; xfs_agblock_t uend, bend; xfs_lsn_t lsn = 0; int cnt; - mp = tp->t_mountp; - - spin_lock(&mp->m_perag[agno].pagb_lock); - - uend = bno + len - 1; + pag = xfs_perag_get(tp->t_mountp, agno); + spin_lock(&pag->pagb_lock); + cnt = pag->pagb_count; /* * search pagb_list for this slot, skipping open slots. We have to @@ -2578,8 +2586,9 @@ xfs_alloc_search_busy(xfs_trans_t *tp, * we have to get the most recent LSN for the log force to push out * all the transactions that span the range. */ - for (cnt = 0; cnt < mp->m_perag[agno].pagb_count; cnt++) { - bsy = &mp->m_perag[agno].pagb_list[cnt]; + uend = bno + len - 1; + for (cnt = 0; cnt < pag->pagb_count; cnt++) { + bsy = &pag->pagb_list[cnt]; if (!bsy->busy_tp) continue; @@ -2591,7 +2600,8 @@ xfs_alloc_search_busy(xfs_trans_t *tp, if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) lsn = bsy->busy_tp->t_commit_lsn; } - spin_unlock(&mp->m_perag[agno].pagb_lock); + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn); /* @@ -2599,5 +2609,5 @@ xfs_alloc_search_busy(xfs_trans_t *tp, * transaction that freed the block */ if (lsn) - xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); + xfs_log_force(tp->t_mountp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); } diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index adbd9141aea1..b726e10d2c1c 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -61,12 +61,14 @@ xfs_allocbt_set_root( struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); int btnum = cur->bc_btnum; + struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); ASSERT(ptr->s != 0); agf->agf_roots[btnum] = ptr->s; be32_add_cpu(&agf->agf_levels[btnum], inc); - cur->bc_mp->m_perag[seqno].pagf_levels[btnum] += inc; + pag->pagf_levels[btnum] += inc; + xfs_perag_put(pag); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -150,6 +152,7 @@ xfs_allocbt_update_lastrec( { struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + struct xfs_perag *pag; __be32 len; int numrecs; @@ -193,7 +196,9 @@ xfs_allocbt_update_lastrec( } agf->agf_longest = len; - cur->bc_mp->m_perag[seqno].pagf_longest = be32_to_cpu(len); + pag = xfs_perag_get(cur->bc_mp, seqno); + pag->pagf_longest = be32_to_cpu(len); + xfs_perag_put(pag); xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST); } -- cgit v1.2.3 From 4196ac08c023c6dab90c3fa460d9c06deaa304c4 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Mon, 11 Jan 2010 11:47:42 +0000 Subject: xfs: Convert filestreams code to use per-ag get/put routines Use xfs_perag_get() and xfs_perag_put() in the filestreams code. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/xfs_filestream.c | 19 ++++++++++++------- fs/xfs/xfs_filestream.h | 27 ++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index a631e1451abb..e61f2aa088a9 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -140,6 +140,7 @@ _xfs_filestream_pick_ag( int flags, xfs_extlen_t minlen) { + int streams, max_streams; int err, trylock, nscan; xfs_extlen_t longest, free, minfree, maxfree = 0; xfs_agnumber_t ag, max_ag = NULLAGNUMBER; @@ -155,15 +156,15 @@ _xfs_filestream_pick_ag( trylock = XFS_ALLOC_FLAG_TRYLOCK; for (nscan = 0; 1; nscan++) { - - TRACE_AG_SCAN(mp, ag, xfs_filestream_peek_ag(mp, ag)); - - pag = mp->m_perag + ag; + pag = xfs_perag_get(mp, ag); + TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms)); if (!pag->pagf_init) { err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); - if (err && !trylock) + if (err && !trylock) { + xfs_perag_put(pag); return err; + } } /* Might fail sometimes during the 1st pass with trylock set. */ @@ -173,6 +174,7 @@ _xfs_filestream_pick_ag( /* Keep track of the AG with the most free blocks. */ if (pag->pagf_freeblks > maxfree) { maxfree = pag->pagf_freeblks; + max_streams = atomic_read(&pag->pagf_fstrms); max_ag = ag; } @@ -195,6 +197,8 @@ _xfs_filestream_pick_ag( /* Break out, retaining the reference on the AG. */ free = pag->pagf_freeblks; + streams = atomic_read(&pag->pagf_fstrms); + xfs_perag_put(pag); *agp = ag; break; } @@ -202,6 +206,7 @@ _xfs_filestream_pick_ag( /* Drop the reference on this AG, it's not usable. */ xfs_filestream_put_ag(mp, ag); next_ag: + xfs_perag_put(pag); /* Move to the next AG, wrapping to AG 0 if necessary. */ if (++ag >= mp->m_sb.sb_agcount) ag = 0; @@ -229,6 +234,7 @@ next_ag: if (max_ag != NULLAGNUMBER) { xfs_filestream_get_ag(mp, max_ag); TRACE_AG_PICK1(mp, max_ag, maxfree); + streams = max_streams; free = maxfree; *agp = max_ag; break; @@ -240,8 +246,7 @@ next_ag: return 0; } - TRACE_AG_PICK2(mp, startag, *agp, xfs_filestream_peek_ag(mp, *agp), - free, nscan, flags); + TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags); return 0; } diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h index 4aba67c5f64f..58378b2ea033 100644 --- a/fs/xfs/xfs_filestream.h +++ b/fs/xfs/xfs_filestream.h @@ -79,12 +79,21 @@ extern ktrace_t *xfs_filestreams_trace_buf; * the cache that reference per-ag array elements that have since been * reallocated. */ +/* + * xfs_filestream_peek_ag is only used in tracing code + */ static inline int xfs_filestream_peek_ag( xfs_mount_t *mp, xfs_agnumber_t agno) { - return atomic_read(&mp->m_perag[agno].pagf_fstrms); + struct xfs_perag *pag; + int ret; + + pag = xfs_perag_get(mp, agno); + ret = atomic_read(&pag->pagf_fstrms); + xfs_perag_put(pag); + return ret; } static inline int @@ -92,7 +101,13 @@ xfs_filestream_get_ag( xfs_mount_t *mp, xfs_agnumber_t agno) { - return atomic_inc_return(&mp->m_perag[agno].pagf_fstrms); + struct xfs_perag *pag; + int ret; + + pag = xfs_perag_get(mp, agno); + ret = atomic_inc_return(&pag->pagf_fstrms); + xfs_perag_put(pag); + return ret; } static inline int @@ -100,7 +115,13 @@ xfs_filestream_put_ag( xfs_mount_t *mp, xfs_agnumber_t agno) { - return atomic_dec_return(&mp->m_perag[agno].pagf_fstrms); + struct xfs_perag *pag; + int ret; + + pag = xfs_perag_get(mp, agno); + ret = atomic_dec_return(&pag->pagf_fstrms); + xfs_perag_put(pag); + return ret; } /* allocation selection flags */ -- cgit v1.2.3 From 44b56e0a1aed522a10051645e85d300e10926fd3 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Mon, 11 Jan 2010 11:47:43 +0000 Subject: xfs: convert remaining direct references to m_perag Convert the remaining direct lookups of the per ag structures to use get/put accesses. Ensure that the loops across AGs and prior users of the interface balance gets and puts correctly. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/xfs_bmap.c | 8 +++++++- fs/xfs/xfs_ialloc.c | 35 +++++++++++++++++++++++++---------- fs/xfs/xfs_inode.c | 5 ++++- fs/xfs/xfs_mount.c | 9 ++++++--- 4 files changed, 42 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 98251cdc52aa..a9b95d9cf2ad 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2630,11 +2630,12 @@ xfs_bmap_btalloc( startag = ag = 0; notinit = 0; down_read(&mp->m_peraglock); + pag = xfs_perag_get(mp, ag); while (blen < ap->alen) { - pag = &mp->m_perag[ag]; if (!pag->pagf_init && (error = xfs_alloc_pagf_init(mp, args.tp, ag, XFS_ALLOC_FLAG_TRYLOCK))) { + xfs_perag_put(pag); up_read(&mp->m_peraglock); return error; } @@ -2667,6 +2668,7 @@ xfs_bmap_btalloc( break; error = xfs_filestream_new_ag(ap, &ag); + xfs_perag_put(pag); if (error) { up_read(&mp->m_peraglock); return error; @@ -2674,6 +2676,7 @@ xfs_bmap_btalloc( /* loop again to set 'blen'*/ startag = NULLAGNUMBER; + pag = xfs_perag_get(mp, ag); continue; } } @@ -2681,7 +2684,10 @@ xfs_bmap_btalloc( ag = 0; if (ag == startag) break; + xfs_perag_put(pag); + pag = xfs_perag_get(mp, ag); } + xfs_perag_put(pag); up_read(&mp->m_peraglock); /* * Since the above loop did a BUF_TRYLOCK, it is diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index cb907ba69c4c..884ee1367f46 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -253,6 +253,7 @@ xfs_ialloc_ag_alloc( xfs_agino_t thisino; /* current inode number, for loop */ int isaligned = 0; /* inode allocation at stripe unit */ /* boundary */ + struct xfs_perag *pag; args.tp = tp; args.mp = tp->t_mountp; @@ -383,7 +384,9 @@ xfs_ialloc_ag_alloc( be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); down_read(&args.mp->m_peraglock); - args.mp->m_perag[agno].pagi_freecount += newlen; + pag = xfs_perag_get(args.mp, agno); + pag->pagi_freecount += newlen; + xfs_perag_put(pag); up_read(&args.mp->m_peraglock); agi->agi_newino = cpu_to_be32(newino); @@ -488,7 +491,7 @@ xfs_ialloc_ag_select( flags = XFS_ALLOC_FLAG_TRYLOCK; down_read(&mp->m_peraglock); for (;;) { - pag = &mp->m_perag[agno]; + pag = xfs_perag_get(mp, agno); if (!pag->pagi_init) { if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { agbp = NULL; @@ -527,6 +530,7 @@ xfs_ialloc_ag_select( agbp = NULL; goto nextag; } + xfs_perag_put(pag); up_read(&mp->m_peraglock); return agbp; } @@ -535,6 +539,7 @@ unlock_nextag: if (agbp) xfs_trans_brelse(tp, agbp); nextag: + xfs_perag_put(pag); /* * No point in iterating over the rest, if we're shutting * down. @@ -672,6 +677,7 @@ xfs_dialloc( xfs_agnumber_t tagno; /* testing allocation group number */ xfs_btree_cur_t *tcur; /* temp cursor */ xfs_inobt_rec_incore_t trec; /* temp inode allocation record */ + struct xfs_perag *pag; if (*IO_agbp == NULL) { @@ -772,11 +778,14 @@ nextag: return noroom ? ENOSPC : 0; } down_read(&mp->m_peraglock); - if (mp->m_perag[tagno].pagi_inodeok == 0) { + pag = xfs_perag_get(mp, tagno); + if (pag->pagi_inodeok == 0) { + xfs_perag_put(pag); up_read(&mp->m_peraglock); goto nextag; } error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); + xfs_perag_put(pag); up_read(&mp->m_peraglock); if (error) goto nextag; @@ -790,6 +799,7 @@ nextag: */ agno = tagno; *IO_agbp = NULL; + pag = xfs_perag_get(mp, agno); restart_pagno: cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); @@ -808,7 +818,6 @@ nextag: * If in the same AG as the parent, try to get near the parent. */ if (pagno == agno) { - xfs_perag_t *pag = &mp->m_perag[agno]; int doneleft; /* done, to the left */ int doneright; /* done, to the right */ int searchdistance = 10; @@ -1007,7 +1016,7 @@ alloc_inode: be32_add_cpu(&agi->agi_freecount, -1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); down_read(&mp->m_peraglock); - mp->m_perag[tagno].pagi_freecount--; + pag->pagi_freecount--; up_read(&mp->m_peraglock); error = xfs_check_agi_freecount(cur, agi); @@ -1016,12 +1025,14 @@ alloc_inode: xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); + xfs_perag_put(pag); *inop = ino; return 0; error1: xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); error0: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + xfs_perag_put(pag); return error; } @@ -1052,6 +1063,7 @@ xfs_difree( xfs_mount_t *mp; /* mount structure for filesystem */ int off; /* offset of inode in inode chunk */ xfs_inobt_rec_incore_t rec; /* btree record */ + struct xfs_perag *pag; mp = tp->t_mountp; @@ -1158,7 +1170,9 @@ xfs_difree( be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); down_read(&mp->m_peraglock); - mp->m_perag[agno].pagi_freecount -= ilen - 1; + pag = xfs_perag_get(mp, agno); + pag->pagi_freecount -= ilen - 1; + xfs_perag_put(pag); up_read(&mp->m_peraglock); xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); @@ -1189,7 +1203,9 @@ xfs_difree( be32_add_cpu(&agi->agi_freecount, 1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); down_read(&mp->m_peraglock); - mp->m_perag[agno].pagi_freecount++; + pag = xfs_perag_get(mp, agno); + pag->pagi_freecount++; + xfs_perag_put(pag); up_read(&mp->m_peraglock); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); } @@ -1379,7 +1395,6 @@ xfs_imap( XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); return XFS_ERROR(EINVAL); } - return 0; } @@ -1523,8 +1538,7 @@ xfs_ialloc_read_agi( return error; agi = XFS_BUF_TO_AGI(*bpp); - pag = &mp->m_perag[agno]; - + pag = xfs_perag_get(mp, agno); if (!pag->pagi_init) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); pag->pagi_count = be32_to_cpu(agi->agi_count); @@ -1537,6 +1551,7 @@ xfs_ialloc_read_agi( */ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || XFS_FORCED_SHUTDOWN(mp)); + xfs_perag_put(pag); return 0; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index bd3d81636d51..0317b000ab44 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2695,7 +2695,7 @@ xfs_iflush_cluster( ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); if (!ilist) - return 0; + goto out_put; mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; @@ -2764,6 +2764,8 @@ xfs_iflush_cluster( out_free: read_unlock(&pag->pag_ici_lock); kmem_free(ilist); +out_put: + xfs_perag_put(pag); return 0; @@ -2807,6 +2809,7 @@ cluster_corrupt_out: */ xfs_iflush_abort(iq); kmem_free(ilist); + xfs_perag_put(pag); return XFS_ERROR(EFSCORRUPTED); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index eb403b40e120..9055b60730d0 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -438,18 +438,20 @@ xfs_initialize_perag( } /* This ag is preferred for inodes */ - pag = &mp->m_perag[index]; + pag = xfs_perag_get(mp, index); pag->pagi_inodeok = 1; if (index < max_metadata) pag->pagf_metadata = 1; xfs_initialize_perag_icache(pag); + xfs_perag_put(pag); } } else { /* Setup default behavior for smaller filesystems */ for (index = 0; index < agcount; index++) { - pag = &mp->m_perag[index]; + pag = xfs_perag_get(mp, index); pag->pagi_inodeok = 1; xfs_initialize_perag_icache(pag); + xfs_perag_put(pag); } } return index; @@ -731,12 +733,13 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount) error = xfs_ialloc_pagi_init(mp, NULL, index); if (error) return error; - pag = &mp->m_perag[index]; + pag = xfs_perag_get(mp, index); ifree += pag->pagi_freecount; ialloc += pag->pagi_count; bfree += pag->pagf_freeblks; bfreelst += pag->pagf_flcount; btree += pag->pagf_btreeblks; + xfs_perag_put(pag); } /* * Overwrite incore superblock counters with just-read data -- cgit v1.2.3 From 1c1c6ebcf5284aee4910f3b906ac90c20e510c82 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Mon, 11 Jan 2010 11:47:44 +0000 Subject: xfs: Replace per-ag array with a radix tree The use of an array for the per-ag structures requires reallocation of the array when growing the filesystem. This requires locking access to the array to avoid use after free situations, and the locking is difficult to get right. To avoid needing to reallocate an array, change the per-ag structures to an allocated object per ag and index them using a tree structure. The AGs are always densely indexed (hence the use of an array), but the number supported is 2^32 and lookups tend to be random and hence indexing needs to scale. A simple choice is a radix tree - it works well with this sort of index. This change also removes another large contiguous allocation from the mount/growfs path in XFS. The growing process now needs to change to only initialise the new AGs required for the extra space, and as such only needs to exclusively lock the tree for inserts. The rest of the code only needs to lock the tree while doing lookups, and hence this will remove all the deadlocks that currently occur on the m_perag_lock as it is now an innermost lock. The lock is also changed to a spinlock from a read/write lock as the hold time is now extremely short. To complete the picture, the per-ag structures will need to be reference counted to ensure that we don't free/modify them while they are still in use. This will be done in subsequent patch. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/xfs_alloc.c | 8 ------- fs/xfs/xfs_bmap.c | 7 +----- fs/xfs/xfs_filestream.c | 13 ++++------ fs/xfs/xfs_fsops.c | 42 ++++++++++++++++----------------- fs/xfs/xfs_ialloc.c | 25 ++------------------ fs/xfs/xfs_itable.c | 4 ---- fs/xfs/xfs_mount.c | 63 +++++++++++++++++++++++++++++++++++++------------ fs/xfs/xfs_mount.h | 14 +++++++---- 8 files changed, 86 insertions(+), 90 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 84070f2e0ba4..4d66bb75579c 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2276,7 +2276,6 @@ xfs_alloc_vextent( * These three force us into a single a.g. */ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); - down_read(&mp->m_peraglock); args->pag = xfs_perag_get(mp, args->agno); args->minleft = 0; error = xfs_alloc_fix_freelist(args, 0); @@ -2286,14 +2285,12 @@ xfs_alloc_vextent( goto error0; } if (!args->agbp) { - up_read(&mp->m_peraglock); trace_xfs_alloc_vextent_noagbp(args); break; } args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); if ((error = xfs_alloc_ag_vextent(args))) goto error0; - up_read(&mp->m_peraglock); break; case XFS_ALLOCTYPE_START_BNO: /* @@ -2345,7 +2342,6 @@ xfs_alloc_vextent( * Loop over allocation groups twice; first time with * trylock set, second time without. */ - down_read(&mp->m_peraglock); for (;;) { args->pag = xfs_perag_get(mp, args->agno); if (no_min) args->minleft = 0; @@ -2408,7 +2404,6 @@ xfs_alloc_vextent( } xfs_perag_put(args->pag); } - up_read(&mp->m_peraglock); if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) { if (args->agno == sagno) mp->m_agfrotor = (mp->m_agfrotor + 1) % @@ -2438,7 +2433,6 @@ xfs_alloc_vextent( return 0; error0: xfs_perag_put(args->pag); - up_read(&mp->m_peraglock); return error; } @@ -2463,7 +2457,6 @@ xfs_free_extent( args.agno = XFS_FSB_TO_AGNO(args.mp, bno); ASSERT(args.agno < args.mp->m_sb.sb_agcount); args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); - down_read(&args.mp->m_peraglock); args.pag = xfs_perag_get(args.mp, args.agno); if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) goto error0; @@ -2475,7 +2468,6 @@ xfs_free_extent( error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); error0: xfs_perag_put(args.pag); - up_read(&args.mp->m_peraglock); return error; } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index a9b95d9cf2ad..7c6d9acd7154 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2629,14 +2629,12 @@ xfs_bmap_btalloc( if (startag == NULLAGNUMBER) startag = ag = 0; notinit = 0; - down_read(&mp->m_peraglock); pag = xfs_perag_get(mp, ag); while (blen < ap->alen) { if (!pag->pagf_init && (error = xfs_alloc_pagf_init(mp, args.tp, ag, XFS_ALLOC_FLAG_TRYLOCK))) { xfs_perag_put(pag); - up_read(&mp->m_peraglock); return error; } /* @@ -2669,10 +2667,8 @@ xfs_bmap_btalloc( error = xfs_filestream_new_ag(ap, &ag); xfs_perag_put(pag); - if (error) { - up_read(&mp->m_peraglock); + if (error) return error; - } /* loop again to set 'blen'*/ startag = NULLAGNUMBER; @@ -2688,7 +2684,6 @@ xfs_bmap_btalloc( pag = xfs_perag_get(mp, ag); } xfs_perag_put(pag); - up_read(&mp->m_peraglock); /* * Since the above loop did a BUF_TRYLOCK, it is * possible that there is space for this request. diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index e61f2aa088a9..914d00d0f119 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -253,8 +253,7 @@ next_ag: /* * Set the allocation group number for a file or a directory, updating inode - * references and per-AG references as appropriate. Must be called with the - * m_peraglock held in read mode. + * references and per-AG references as appropriate. */ static int _xfs_filestream_update_ag( @@ -456,10 +455,10 @@ xfs_filestream_unmount( } /* - * If the mount point's m_perag array is going to be reallocated, all + * If the mount point's m_perag tree is going to be modified, all * outstanding cache entries must be flushed to avoid accessing reference count * addresses that have been freed. The call to xfs_filestream_flush() must be - * made inside the block that holds the m_peraglock in write mode to do the + * made inside the block that holds the m_perag_lock in write mode to do the * reallocation. */ void @@ -531,7 +530,6 @@ xfs_filestream_associate( mp = pip->i_mount; cache = mp->m_filestream; - down_read(&mp->m_peraglock); /* * We have a problem, Houston. @@ -548,10 +546,8 @@ xfs_filestream_associate( * * So, if we can't get the iolock without sleeping then just give up */ - if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) { - up_read(&mp->m_peraglock); + if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) return 1; - } /* If the parent directory is already in the cache, use its AG. */ item = xfs_mru_cache_lookup(cache, pip->i_ino); @@ -606,7 +602,6 @@ exit_did_pick: exit: xfs_iunlock(pip, XFS_IOLOCK_EXCL); - up_read(&mp->m_peraglock); return -err; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a13919a6a364..37a6f62c57b6 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -167,27 +167,14 @@ xfs_growfs_data_private( } new = nb - mp->m_sb.sb_dblocks; oagcount = mp->m_sb.sb_agcount; - if (nagcount > oagcount) { - void *new_perag, *old_perag; - - xfs_filestream_flush(mp); - - new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount, - KM_MAYFAIL); - if (!new_perag) - return XFS_ERROR(ENOMEM); - - down_write(&mp->m_peraglock); - memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount); - old_perag = mp->m_perag; - mp->m_perag = new_perag; - - mp->m_flags |= XFS_MOUNT_32BITINODES; - nagimax = xfs_initialize_perag(mp, nagcount); - up_write(&mp->m_peraglock); - kmem_free(old_perag); + /* allocate the new per-ag structures */ + if (nagcount > oagcount) { + error = xfs_initialize_perag(mp, nagcount, &nagimax); + if (error) + return error; } + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); tp->t_flags |= XFS_TRANS_RESERVE; if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp), @@ -196,6 +183,11 @@ xfs_growfs_data_private( return error; } + /* + * Write new AG headers to disk. Non-transactional, but written + * synchronously so they are completed prior to the growfs transaction + * being logged. + */ nfree = 0; for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { /* @@ -359,6 +351,12 @@ xfs_growfs_data_private( goto error0; } } + + /* + * Update changed superblock fields transactionally. These are not + * seen by the rest of the world until the transaction commit applies + * them atomically to the superblock. + */ if (nagcount > oagcount) xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount); if (nb > mp->m_sb.sb_dblocks) @@ -369,9 +367,9 @@ xfs_growfs_data_private( if (dpct) xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); error = xfs_trans_commit(tp, 0); - if (error) { + if (error) return error; - } + /* New allocation groups fully initialized, so update mount struct */ if (nagimax) mp->m_maxagi = nagimax; @@ -381,6 +379,8 @@ xfs_growfs_data_private( mp->m_maxicount = icount << mp->m_sb.sb_inopblog; } else mp->m_maxicount = 0; + + /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { error = xfs_read_buf(mp, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 884ee1367f46..52c9d006c0e6 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -383,11 +383,9 @@ xfs_ialloc_ag_alloc( newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); - down_read(&args.mp->m_peraglock); pag = xfs_perag_get(args.mp, agno); pag->pagi_freecount += newlen; xfs_perag_put(pag); - up_read(&args.mp->m_peraglock); agi->agi_newino = cpu_to_be32(newino); /* @@ -489,7 +487,6 @@ xfs_ialloc_ag_select( */ agno = pagno; flags = XFS_ALLOC_FLAG_TRYLOCK; - down_read(&mp->m_peraglock); for (;;) { pag = xfs_perag_get(mp, agno); if (!pag->pagi_init) { @@ -531,7 +528,6 @@ xfs_ialloc_ag_select( goto nextag; } xfs_perag_put(pag); - up_read(&mp->m_peraglock); return agbp; } } @@ -544,18 +540,14 @@ nextag: * No point in iterating over the rest, if we're shutting * down. */ - if (XFS_FORCED_SHUTDOWN(mp)) { - up_read(&mp->m_peraglock); + if (XFS_FORCED_SHUTDOWN(mp)) return NULL; - } agno++; if (agno >= agcount) agno = 0; if (agno == pagno) { - if (flags == 0) { - up_read(&mp->m_peraglock); + if (flags == 0) return NULL; - } flags = 0; } } @@ -777,16 +769,13 @@ nextag: *inop = NULLFSINO; return noroom ? ENOSPC : 0; } - down_read(&mp->m_peraglock); pag = xfs_perag_get(mp, tagno); if (pag->pagi_inodeok == 0) { xfs_perag_put(pag); - up_read(&mp->m_peraglock); goto nextag; } error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); xfs_perag_put(pag); - up_read(&mp->m_peraglock); if (error) goto nextag; agi = XFS_BUF_TO_AGI(agbp); @@ -1015,9 +1004,7 @@ alloc_inode: goto error0; be32_add_cpu(&agi->agi_freecount, -1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - down_read(&mp->m_peraglock); pag->pagi_freecount--; - up_read(&mp->m_peraglock); error = xfs_check_agi_freecount(cur, agi); if (error) @@ -1100,9 +1087,7 @@ xfs_difree( /* * Get the allocation group header. */ - down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); - up_read(&mp->m_peraglock); if (error) { cmn_err(CE_WARN, "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.", @@ -1169,11 +1154,9 @@ xfs_difree( be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); - down_read(&mp->m_peraglock); pag = xfs_perag_get(mp, agno); pag->pagi_freecount -= ilen - 1; xfs_perag_put(pag); - up_read(&mp->m_peraglock); xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); @@ -1202,11 +1185,9 @@ xfs_difree( */ be32_add_cpu(&agi->agi_freecount, 1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - down_read(&mp->m_peraglock); pag = xfs_perag_get(mp, agno); pag->pagi_freecount++; xfs_perag_put(pag); - up_read(&mp->m_peraglock); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); } @@ -1328,9 +1309,7 @@ xfs_imap( xfs_buf_t *agbp; /* agi buffer */ int i; /* temp state */ - down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); - up_read(&mp->m_peraglock); if (error) { xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " "xfs_ialloc_read_agi() returned " diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 62efab2f3839..940307a6a60b 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -420,9 +420,7 @@ xfs_bulkstat( while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { cond_resched(); bp = NULL; - down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); - up_read(&mp->m_peraglock); if (error) { /* * Skip this allocation group and go to the next one. @@ -849,9 +847,7 @@ xfs_inumbers( agbp = NULL; while (left > 0 && agno < mp->m_sb.sb_agcount) { if (agbp == NULL) { - down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); - up_read(&mp->m_peraglock); if (error) { /* * If we can't read the AGI of this ag, diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 9055b60730d0..c04dd83cb57c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -209,13 +209,16 @@ STATIC void xfs_free_perag( xfs_mount_t *mp) { - if (mp->m_perag) { - int agno; + xfs_agnumber_t agno; + struct xfs_perag *pag; - for (agno = 0; agno < mp->m_maxagi; agno++) - if (mp->m_perag[agno].pagb_list) - kmem_free(mp->m_perag[agno].pagb_list); - kmem_free(mp->m_perag); + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + spin_lock(&mp->m_perag_lock); + pag = radix_tree_delete(&mp->m_perag_tree, agno); + spin_unlock(&mp->m_perag_lock); + ASSERT(pag); + kmem_free(pag->pagb_list); + kmem_free(pag); } } @@ -389,10 +392,11 @@ xfs_initialize_perag_icache( } } -xfs_agnumber_t +int xfs_initialize_perag( xfs_mount_t *mp, - xfs_agnumber_t agcount) + xfs_agnumber_t agcount, + xfs_agnumber_t *maxagi) { xfs_agnumber_t index, max_metadata; xfs_perag_t *pag; @@ -405,6 +409,33 @@ xfs_initialize_perag( agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); + /* + * Walk the current per-ag tree so we don't try to initialise AGs + * that already exist (growfs case). Allocate and insert all the + * AGs we don't find ready for initialisation. + */ + for (index = 0; index < agcount; index++) { + pag = xfs_perag_get(mp, index); + if (pag) { + xfs_perag_put(pag); + continue; + } + pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); + if (!pag) + return -ENOMEM; + if (radix_tree_preload(GFP_NOFS)) + return -ENOMEM; + spin_lock(&mp->m_perag_lock); + if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { + BUG(); + spin_unlock(&mp->m_perag_lock); + kmem_free(pag); + return -EEXIST; + } + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + } + /* Clear the mount flag if no inode can overflow 32 bits * on this filesystem, or if specifically requested.. */ @@ -454,7 +485,9 @@ xfs_initialize_perag( xfs_perag_put(pag); } } - return index; + if (maxagi) + *maxagi = index; + return 0; } void @@ -1155,13 +1188,13 @@ xfs_mountfs( /* * Allocate and initialize the per-ag data. */ - init_rwsem(&mp->m_peraglock); - mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), - KM_MAYFAIL); - if (!mp->m_perag) + spin_lock_init(&mp->m_perag_lock); + INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS); + error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); + if (error) { + cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error); goto out_remove_uuid; - - mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount); + } if (!sbp->sb_logblocks) { cmn_err(CE_WARN, "XFS: no log defined"); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index f8a68a2319b5..cfa7a5d22e72 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -207,8 +207,8 @@ typedef struct xfs_mount { uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ uint m_in_maxlevels; /* max inobt btree levels. */ - struct xfs_perag *m_perag; /* per-ag accounting info */ - struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ + struct radix_tree_root m_perag_tree; /* per-ag accounting info */ + spinlock_t m_perag_lock; /* lock for m_perag_tree */ struct mutex m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ @@ -389,7 +389,12 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) static inline struct xfs_perag * xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) { - return &mp->m_perag[agno]; + struct xfs_perag *pag; + + spin_lock(&mp->m_perag_lock); + pag = radix_tree_lookup(&mp->m_perag_tree, agno); + spin_unlock(&mp->m_perag_lock); + return pag; } static inline void @@ -450,7 +455,8 @@ extern struct xfs_dmops xfs_dmcore_xfs; #endif /* __KERNEL__ */ extern void xfs_mod_sb(struct xfs_trans *, __int64_t); -extern xfs_agnumber_t xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t); +extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, + xfs_agnumber_t *); extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); -- cgit v1.2.3 From aed3bb90abaf0b42e8c8747e192f7bb97f445279 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Mon, 11 Jan 2010 11:47:45 +0000 Subject: xfs: Reference count per-ag structures Reference count the per-ag structures to ensure that we keep get/put pairs balanced. Assert that the reference counts are zero at unmount time to catch leaks. In future, reference counts will enable us to safely remove perag structures by allowing us to detect when they are no longer in use. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/xfs_ag.h | 4 ++-- fs/xfs/xfs_mount.c | 1 + fs/xfs/xfs_mount.h | 11 +++++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 6702bd865811..18ae43f4255d 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -196,8 +196,8 @@ typedef struct xfs_perag_busy { #define XFS_PAGB_NUM_SLOTS 128 #endif -typedef struct xfs_perag -{ +typedef struct xfs_perag { + atomic_t pag_ref; /* perag reference count */ char pagf_init; /* this agf's entry is initialized */ char pagi_init; /* this agi's entry is initialized */ char pagf_metadata; /* the agf is preferred to be metadata */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c04dd83cb57c..f241fec26070 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -215,6 +215,7 @@ xfs_free_perag( for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { spin_lock(&mp->m_perag_lock); pag = radix_tree_delete(&mp->m_perag_tree, agno); + ASSERT(atomic_read(&pag->pag_ref) == 0); spin_unlock(&mp->m_perag_lock); ASSERT(pag); kmem_free(pag->pagb_list); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index cfa7a5d22e72..16b22120b98f 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -384,7 +384,7 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) } /* - * perag get/put wrappers for eventual ref counting + * perag get/put wrappers for ref counting */ static inline struct xfs_perag * xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) @@ -393,6 +393,12 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) spin_lock(&mp->m_perag_lock); pag = radix_tree_lookup(&mp->m_perag_tree, agno); + if (pag) { + ASSERT(atomic_read(&pag->pag_ref) >= 0); + /* catch leaks in the positive direction during testing */ + ASSERT(atomic_read(&pag->pag_ref) < 1000); + atomic_inc(&pag->pag_ref); + } spin_unlock(&mp->m_perag_lock); return pag; } @@ -400,7 +406,8 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) static inline void xfs_perag_put(struct xfs_perag *pag) { - /* nothing to see here, move along */ + ASSERT(atomic_read(&pag->pag_ref) > 0); + atomic_dec(&pag->pag_ref); } /* -- cgit v1.2.3 From 0fa800fbd549736dfdc1d7761f87e33dc8cd973b Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Mon, 11 Jan 2010 11:47:46 +0000 Subject: xfs: Add trace points for per-ag refcount debugging. Uninline xfs_perag_{get,put} so that tracepoints can be inserted into them to speed debugging of reference count problems. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_trace.h | 27 +++++++++++++++++++++++++++ fs/xfs/xfs_ag.h | 2 ++ fs/xfs/xfs_mount.c | 34 ++++++++++++++++++++++++++++++++++ fs/xfs/xfs_mount.h | 25 ++----------------------- 4 files changed, 65 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index 3353aef50530..1bb09e70b2eb 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -78,6 +78,33 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class, ) ) +#define DEFINE_PERAG_REF_EVENT(name) \ +TRACE_EVENT(name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \ + unsigned long caller_ip), \ + TP_ARGS(mp, agno, refcount, caller_ip), \ + TP_STRUCT__entry( \ + __field(dev_t, dev) \ + __field(xfs_agnumber_t, agno) \ + __field(int, refcount) \ + __field(unsigned long, caller_ip) \ + ), \ + TP_fast_assign( \ + __entry->dev = mp->m_super->s_dev; \ + __entry->agno = agno; \ + __entry->refcount = refcount; \ + __entry->caller_ip = caller_ip; \ + ), \ + TP_printk("dev %d:%d agno %u refcount %d caller %pf", \ + MAJOR(__entry->dev), MINOR(__entry->dev), \ + __entry->agno, \ + __entry->refcount, \ + (char *)__entry->caller_ip) \ +); + +DEFINE_PERAG_REF_EVENT(xfs_perag_get) +DEFINE_PERAG_REF_EVENT(xfs_perag_put) + #define DEFINE_ATTR_LIST_EVENT(name) \ DEFINE_EVENT(xfs_attr_list_class, name, \ TP_PROTO(struct xfs_attr_list_context *ctx), \ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 18ae43f4255d..963bc2700bf7 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -197,6 +197,8 @@ typedef struct xfs_perag_busy { #endif typedef struct xfs_perag { + struct xfs_mount *pag_mount; /* owner filesystem */ + xfs_agnumber_t pag_agno; /* AG this structure belongs to */ atomic_t pag_ref; /* perag reference count */ char pagf_init; /* this agf's entry is initialized */ char pagi_init; /* this agi's entry is initialized */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index f241fec26070..049dbc71c28e 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -200,6 +200,38 @@ xfs_uuid_unmount( } +/* + * Reference counting access wrappers to the perag structures. + */ +struct xfs_perag * +xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ref = 0; + + spin_lock(&mp->m_perag_lock); + pag = radix_tree_lookup(&mp->m_perag_tree, agno); + if (pag) { + ASSERT(atomic_read(&pag->pag_ref) >= 0); + /* catch leaks in the positive direction during testing */ + ASSERT(atomic_read(&pag->pag_ref) < 1000); + ref = atomic_inc_return(&pag->pag_ref); + } + spin_unlock(&mp->m_perag_lock); + trace_xfs_perag_get(mp, agno, ref, _RET_IP_); + return pag; +} + +void +xfs_perag_put(struct xfs_perag *pag) +{ + int ref; + + ASSERT(atomic_read(&pag->pag_ref) > 0); + ref = atomic_dec_return(&pag->pag_ref); + trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); +} + /* * Free up the resources associated with a mount structure. Assume that * the structure was initially zeroed, so we can tell which fields got @@ -433,6 +465,8 @@ xfs_initialize_perag( kmem_free(pag); return -EEXIST; } + pag->pag_agno = index; + pag->pag_mount = mp; spin_unlock(&mp->m_perag_lock); radix_tree_preload_end(); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 16b22120b98f..e62fd1cde464 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -386,29 +386,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) /* * perag get/put wrappers for ref counting */ -static inline struct xfs_perag * -xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - - spin_lock(&mp->m_perag_lock); - pag = radix_tree_lookup(&mp->m_perag_tree, agno); - if (pag) { - ASSERT(atomic_read(&pag->pag_ref) >= 0); - /* catch leaks in the positive direction during testing */ - ASSERT(atomic_read(&pag->pag_ref) < 1000); - atomic_inc(&pag->pag_ref); - } - spin_unlock(&mp->m_perag_lock); - return pag; -} - -static inline void -xfs_perag_put(struct xfs_perag *pag) -{ - ASSERT(atomic_read(&pag->pag_ref) > 0); - atomic_dec(&pag->pag_ref); -} +struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); +void xfs_perag_put(struct xfs_perag *pag); /* * Per-cpu superblock locking functions -- cgit v1.2.3 From b657fc82a3ca6d7ad16a59e81765f0fb0e86cdbb Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Mon, 11 Jan 2010 11:47:47 +0000 Subject: xfs: Kill filestreams cache flush The filestreams cache flush is not needed in the sync code as it does not affect data writeback, and it is now not used by the growfs code, either, so kill it. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_sync.c | 3 --- fs/xfs/xfs_filestream.c | 14 -------------- fs/xfs/xfs_filestream.h | 1 - fs/xfs/xfs_mru_cache.c | 2 +- fs/xfs/xfs_mru_cache.h | 1 - 5 files changed, 1 insertion(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index cc964faf12e9..b58f8412dfe2 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -451,9 +451,6 @@ xfs_quiesce_data( xfs_sync_data(mp, SYNC_WAIT); xfs_qm_sync(mp, SYNC_WAIT); - /* drop inode references pinned by filestreams */ - xfs_filestream_flush(mp); - /* write superblock and hoover up shutdown errors */ error = xfs_sync_fsdata(mp, SYNC_WAIT); diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 914d00d0f119..390850ee6603 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -454,20 +454,6 @@ xfs_filestream_unmount( xfs_mru_cache_destroy(mp->m_filestream); } -/* - * If the mount point's m_perag tree is going to be modified, all - * outstanding cache entries must be flushed to avoid accessing reference count - * addresses that have been freed. The call to xfs_filestream_flush() must be - * made inside the block that holds the m_perag_lock in write mode to do the - * reallocation. - */ -void -xfs_filestream_flush( - xfs_mount_t *mp) -{ - xfs_mru_cache_flush(mp->m_filestream); -} - /* * Return the AG of the filestream the file or directory belongs to, or * NULLAGNUMBER otherwise. diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h index 58378b2ea033..260f757bbc5d 100644 --- a/fs/xfs/xfs_filestream.h +++ b/fs/xfs/xfs_filestream.h @@ -135,7 +135,6 @@ int xfs_filestream_init(void); void xfs_filestream_uninit(void); int xfs_filestream_mount(struct xfs_mount *mp); void xfs_filestream_unmount(struct xfs_mount *mp); -void xfs_filestream_flush(struct xfs_mount *mp); xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip); void xfs_filestream_deassociate(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 4b0613d99faa..45ce15dc5b2b 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -398,7 +398,7 @@ exit: * guaranteed that all the free functions for all the elements have finished * executing and the reaper is not running. */ -void +static void xfs_mru_cache_flush( xfs_mru_cache_t *mru) { diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h index 5d439f34b0c9..36dd3ec8b4eb 100644 --- a/fs/xfs/xfs_mru_cache.h +++ b/fs/xfs/xfs_mru_cache.h @@ -42,7 +42,6 @@ void xfs_mru_cache_uninit(void); int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, unsigned int grp_count, xfs_mru_cache_free_func_t free_func); -void xfs_mru_cache_flush(xfs_mru_cache_t *mru); void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, void *value); -- cgit v1.2.3 From 8b26c5825e023b1bccac7afd174ebe55b8905cb1 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Mon, 11 Jan 2010 11:47:48 +0000 Subject: xfs: handle ENOMEM correctly during initialisation of perag structures Add proper error handling in case an error occurs while initializing new perag structures for a mount point. The mount structure is restored to its previous state by deleting and freeing any perag structures added during the call. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/xfs_mount.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 049dbc71c28e..be643e588067 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -432,11 +432,13 @@ xfs_initialize_perag( xfs_agnumber_t *maxagi) { xfs_agnumber_t index, max_metadata; + xfs_agnumber_t first_initialised = 0; xfs_perag_t *pag; xfs_agino_t agino; xfs_ino_t ino; xfs_sb_t *sbp = &mp->m_sb; xfs_ino_t max_inum = XFS_MAXINUMBER_32; + int error = -ENOMEM; /* Check to see if the filesystem can overflow 32 bit inodes */ agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); @@ -453,17 +455,20 @@ xfs_initialize_perag( xfs_perag_put(pag); continue; } + if (!first_initialised) + first_initialised = index; pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); if (!pag) - return -ENOMEM; + goto out_unwind; if (radix_tree_preload(GFP_NOFS)) - return -ENOMEM; + goto out_unwind; spin_lock(&mp->m_perag_lock); if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { BUG(); spin_unlock(&mp->m_perag_lock); - kmem_free(pag); - return -EEXIST; + radix_tree_preload_end(); + error = -EEXIST; + goto out_unwind; } pag->pag_agno = index; pag->pag_mount = mp; @@ -523,6 +528,14 @@ xfs_initialize_perag( if (maxagi) *maxagi = index; return 0; + +out_unwind: + kmem_free(pag); + for (; index > first_initialised; index--) { + pag = radix_tree_delete(&mp->m_perag_tree, index); + kmem_free(pag); + } + return error; } void -- cgit v1.2.3 From e57336ff7fc7520bec7b3a7741043bdebaf622ea Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Mon, 11 Jan 2010 11:47:49 +0000 Subject: xfs: embed the pagb_list array in the perag structure Now that the perag structure is allocated memory rather than held in an array, we don't need to have the busy extent array external to the structure. Embed it into the perag structure to avoid needing an extra allocation when setting up. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/xfs_ag.h | 10 ++-------- fs/xfs/xfs_alloc.c | 4 ++-- fs/xfs/xfs_mount.c | 3 +-- 3 files changed, 5 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 963bc2700bf7..b1a5a1ff88ea 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -187,14 +187,8 @@ typedef struct xfs_perag_busy { /* * Per-ag incore structure, copies of information in agf and agi, * to improve the performance of allocation group selection. - * - * pick sizes which fit in allocation buckets well */ -#if (BITS_PER_LONG == 32) -#define XFS_PAGB_NUM_SLOTS 84 -#elif (BITS_PER_LONG == 64) #define XFS_PAGB_NUM_SLOTS 128 -#endif typedef struct xfs_perag { struct xfs_mount *pag_mount; /* owner filesystem */ @@ -212,8 +206,6 @@ typedef struct xfs_perag { __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ xfs_agino_t pagi_freecount; /* number of free inodes */ xfs_agino_t pagi_count; /* number of allocated inodes */ - int pagb_count; /* pagb slots in use */ - xfs_perag_busy_t *pagb_list; /* unstable blocks */ /* * Inode allocation search lookup optimisation. @@ -232,6 +224,8 @@ typedef struct xfs_perag { rwlock_t pag_ici_lock; /* incore inode lock */ struct radix_tree_root pag_ici_root; /* incore inode cache root */ #endif + int pagb_count; /* pagb slots in use */ + xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */ } xfs_perag_t; /* diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 4d66bb75579c..8aa181d6dd7d 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2200,8 +2200,8 @@ xfs_alloc_read_agf( pag->pagf_levels[XFS_BTNUM_CNTi] = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); spin_lock_init(&pag->pagb_lock); - pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS * - sizeof(xfs_perag_busy_t), KM_SLEEP); + pag->pagb_count = 0; + memset(pag->pagb_list, 0, sizeof(pag->pagb_list)); pag->pagf_init = 1; } #ifdef DEBUG diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index be643e588067..0df5045abd3b 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -247,10 +247,9 @@ xfs_free_perag( for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { spin_lock(&mp->m_perag_lock); pag = radix_tree_delete(&mp->m_perag_tree, agno); + ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); spin_unlock(&mp->m_perag_lock); - ASSERT(pag); - kmem_free(pag->pagb_list); kmem_free(pag); } } -- cgit v1.2.3 From 873ff5501d8cd1a21045d6c1da34f0c3876bc235 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 13 Jan 2010 22:17:57 +0000 Subject: xfs: clean up log buffer writes Don't bother using XFS_bwrite as it doesn't provide much code for our use case. Instead opencode it and fold xlog_bdstrat_cb into the new xlog_bdstrat helper. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_buf.h | 2 -- fs/xfs/xfs_log.c | 67 +++++++++++++++++++++++----------------------- 2 files changed, 33 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index a34c7b54822d..c20a76001867 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -408,8 +408,6 @@ static inline int XFS_bwrite(xfs_buf_t *bp) return error; } -#define XFS_bdstrat(bp) xfs_buf_iorequest(bp) - #define xfs_iowait(bp) xfs_buf_iowait(bp) #define xfs_baread(target, rablkno, ralen) \ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 600b5b06aaeb..0d17516fbb13 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -50,7 +50,6 @@ kmem_zone_t *xfs_log_ticket_zone; (off) += (bytes);} /* Local miscellaneous function prototypes */ -STATIC int xlog_bdstrat_cb(struct xfs_buf *); STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket, xlog_in_core_t **, xfs_lsn_t *); STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, @@ -987,35 +986,6 @@ xlog_iodone(xfs_buf_t *bp) } /* xlog_iodone */ -/* - * The bdstrat callback function for log bufs. This gives us a central - * place to trap bufs in case we get hit by a log I/O error and need to - * shutdown. Actually, in practice, even when we didn't get a log error, - * we transition the iclogs to IOERROR state *after* flushing all existing - * iclogs to disk. This is because we don't want anymore new transactions to be - * started or completed afterwards. - */ -STATIC int -xlog_bdstrat_cb(struct xfs_buf *bp) -{ - xlog_in_core_t *iclog; - - iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *); - - if ((iclog->ic_state & XLOG_STATE_IOERROR) == 0) { - /* note for irix bstrat will need struct bdevsw passed - * Fix the following macro if the code ever is merged - */ - XFS_bdstrat(bp); - return 0; - } - - XFS_BUF_ERROR(bp, EIO); - XFS_BUF_STALE(bp); - xfs_biodone(bp); - return XFS_ERROR(EIO); -} - /* * Return size of each in-core log record buffer. * @@ -1158,7 +1128,6 @@ xlog_alloc_log(xfs_mount_t *mp, if (!bp) goto out_free_log; XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); - XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); @@ -1196,7 +1165,6 @@ xlog_alloc_log(xfs_mount_t *mp, if (!XFS_BUF_CPSEMA(bp)) ASSERT(0); XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); - XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; @@ -1343,6 +1311,37 @@ xlog_grant_push_ail(xfs_mount_t *mp, xfs_trans_ail_push(log->l_ailp, threshold_lsn); } /* xlog_grant_push_ail */ +/* + * The bdstrat callback function for log bufs. This gives us a central + * place to trap bufs in case we get hit by a log I/O error and need to + * shutdown. Actually, in practice, even when we didn't get a log error, + * we transition the iclogs to IOERROR state *after* flushing all existing + * iclogs to disk. This is because we don't want anymore new transactions to be + * started or completed afterwards. + */ +STATIC int +xlog_bdstrat( + struct xfs_buf *bp) +{ + struct xlog_in_core *iclog; + + iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *); + if (iclog->ic_state & XLOG_STATE_IOERROR) { + XFS_BUF_ERROR(bp, EIO); + XFS_BUF_STALE(bp); + xfs_biodone(bp); + /* + * It would seem logical to return EIO here, but we rely on + * the log state machine to propagate I/O errors instead of + * doing it here. + */ + return 0; + } + + bp->b_flags |= _XBF_RUN_QUEUES; + xfs_buf_iorequest(bp); + return 0; +} /* * Flush out the in-core log (iclog) to the on-disk log in an asynchronous @@ -1462,7 +1461,7 @@ xlog_sync(xlog_t *log, */ XFS_BUF_WRITE(bp); - if ((error = XFS_bwrite(bp))) { + if ((error = xlog_bdstrat(bp))) { xfs_ioerror_alert("xlog_sync", log->l_mp, bp, XFS_BUF_ADDR(bp)); return error; @@ -1502,7 +1501,7 @@ xlog_sync(xlog_t *log, /* account for internal log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); XFS_BUF_WRITE(bp); - if ((error = XFS_bwrite(bp))) { + if ((error = xlog_bdstrat(bp))) { xfs_ioerror_alert("xlog_sync (split)", log->l_mp, bp, XFS_BUF_ADDR(bp)); return error; -- cgit v1.2.3 From 64e0bc7d2a6609ad265757a600e2a0d93c8adb47 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 13 Jan 2010 22:17:58 +0000 Subject: xfs: clean up xfs_bwrite Fold XFS_bwrite into it's only caller, xfs_bwrite and move it into xfs_buf.c instead of leaving it as a fairly large inline function. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_buf.c | 27 +++++++++++++++++++++++++++ fs/xfs/linux-2.6/xfs_buf.h | 19 +------------------ fs/xfs/xfs_rw.c | 31 ------------------------------- fs/xfs/xfs_rw.h | 1 - 4 files changed, 28 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 18ae3ba8f78a..492465c6e0b4 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1050,6 +1050,33 @@ xfs_buf_ioerror( trace_xfs_buf_ioerror(bp, error, _RET_IP_); } +int +xfs_bwrite( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + int iowait = (bp->b_flags & XBF_ASYNC) == 0; + int error = 0; + + bp->b_strat = xfs_bdstrat_cb; + bp->b_mount = mp; + bp->b_flags |= XBF_WRITE; + if (!iowait) + bp->b_flags |= _XBF_RUN_QUEUES; + + xfs_buf_delwri_dequeue(bp); + xfs_buf_iostrategy(bp); + + if (iowait) { + error = xfs_buf_iowait(bp); + if (error) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + xfs_buf_relse(bp); + } + + return error; +} + int xfs_bawrite( void *mp, diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index c20a76001867..f69b8e714a11 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -232,6 +232,7 @@ extern void xfs_buf_lock(xfs_buf_t *); extern void xfs_buf_unlock(xfs_buf_t *); /* Buffer Read and Write Routines */ +extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); extern int xfs_bawrite(void *mp, xfs_buf_t *bp); extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); extern void xfs_buf_ioend(xfs_buf_t *, int); @@ -390,24 +391,6 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) #define xfs_biozero(bp, off, len) \ xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) - -static inline int XFS_bwrite(xfs_buf_t *bp) -{ - int iowait = (bp->b_flags & XBF_ASYNC) == 0; - int error = 0; - - if (!iowait) - bp->b_flags |= _XBF_RUN_QUEUES; - - xfs_buf_delwri_dequeue(bp); - xfs_buf_iostrategy(bp); - if (iowait) { - error = xfs_buf_iowait(bp); - xfs_buf_relse(bp); - } - return error; -} - #define xfs_iowait(bp) xfs_buf_iowait(bp) #define xfs_baread(target, rablkno, ralen) \ diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index 5aa07caea5f1..9d933a10d6bb 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -305,37 +305,6 @@ xfs_read_buf( return (error); } -/* - * Wrapper around bwrite() so that we can trap - * write errors, and act accordingly. - */ -int -xfs_bwrite( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - int error; - - /* - * XXXsup how does this work for quotas. - */ - XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); - bp->b_mount = mp; - XFS_BUF_WRITE(bp); - - if ((error = XFS_bwrite(bp))) { - ASSERT(mp); - /* - * Cannot put a buftrace here since if the buffer is not - * B_HOLD then we will brelse() the buffer before returning - * from bwrite and we could be tracing a buffer that has - * been reused. - */ - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - } - return (error); -} - /* * helper function to extract extent size hint from inode */ diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index 571f2174435c..ff68eb5e738e 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -40,7 +40,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) * Prototypes for functions in xfs_rw.c. */ extern int xfs_write_clear_setuid(struct xfs_inode *ip); -extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); extern int xfs_bioerror(struct xfs_buf *bp); extern int xfs_bioerror_relse(struct xfs_buf *bp); extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, -- cgit v1.2.3 From 4e23471a3f3aba885ea70100db47ccacb5f069f6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 13 Jan 2010 22:17:56 +0000 Subject: xfs: move more buffer helpers into xfs_buf.c Move xfsbdstrat and xfs_bdstrat_cb from xfs_lrw.c and xfs_bioerror and xfs_bioerror_relse from xfs_rw.c into xfs_buf.c. This also means xfs_bioerror and xfs_bioerror_relse can be marked static now. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_buf.c | 120 +++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/linux-2.6/xfs_buf.h | 4 ++ fs/xfs/linux-2.6/xfs_lrw.c | 47 ------------------ fs/xfs/linux-2.6/xfs_lrw.h | 3 -- fs/xfs/xfs_rw.c | 82 ------------------------------- fs/xfs/xfs_rw.h | 2 - 6 files changed, 124 insertions(+), 134 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 492465c6e0b4..158fad4550df 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1112,6 +1112,126 @@ xfs_bdwrite( xfs_buf_delwri_queue(bp, 1); } +/* + * Called when we want to stop a buffer from getting written or read. + * We attach the EIO error, muck with its flags, and call biodone + * so that the proper iodone callbacks get called. + */ +STATIC int +xfs_bioerror( + xfs_buf_t *bp) +{ +#ifdef XFSERRORDEBUG + ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone); +#endif + + /* + * No need to wait until the buffer is unpinned, we aren't flushing it. + */ + XFS_BUF_ERROR(bp, EIO); + + /* + * We're calling biodone, so delete XBF_DONE flag. + */ + XFS_BUF_UNREAD(bp); + XFS_BUF_UNDELAYWRITE(bp); + XFS_BUF_UNDONE(bp); + XFS_BUF_STALE(bp); + + XFS_BUF_CLR_BDSTRAT_FUNC(bp); + xfs_biodone(bp); + + return EIO; +} + +/* + * Same as xfs_bioerror, except that we are releasing the buffer + * here ourselves, and avoiding the biodone call. + * This is meant for userdata errors; metadata bufs come with + * iodone functions attached, so that we can track down errors. + */ +STATIC int +xfs_bioerror_relse( + struct xfs_buf *bp) +{ + int64_t fl = XFS_BUF_BFLAGS(bp); + /* + * No need to wait until the buffer is unpinned. + * We aren't flushing it. + * + * chunkhold expects B_DONE to be set, whether + * we actually finish the I/O or not. We don't want to + * change that interface. + */ + XFS_BUF_UNREAD(bp); + XFS_BUF_UNDELAYWRITE(bp); + XFS_BUF_DONE(bp); + XFS_BUF_STALE(bp); + XFS_BUF_CLR_IODONE_FUNC(bp); + XFS_BUF_CLR_BDSTRAT_FUNC(bp); + if (!(fl & XFS_B_ASYNC)) { + /* + * Mark b_error and B_ERROR _both_. + * Lot's of chunkcache code assumes that. + * There's no reason to mark error for + * ASYNC buffers. + */ + XFS_BUF_ERROR(bp, EIO); + XFS_BUF_FINISH_IOWAIT(bp); + } else { + xfs_buf_relse(bp); + } + + return EIO; +} + + +/* + * All xfs metadata buffers except log state machine buffers + * get this attached as their b_bdstrat callback function. + * This is so that we can catch a buffer + * after prematurely unpinning it to forcibly shutdown the filesystem. + */ +int +xfs_bdstrat_cb( + struct xfs_buf *bp) +{ + if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { + trace_xfs_bdstrat_shut(bp, _RET_IP_); + /* + * Metadata write that didn't get logged but + * written delayed anyway. These aren't associated + * with a transaction, and can be ignored. + */ + if (!bp->b_iodone && !XFS_BUF_ISREAD(bp)) + return xfs_bioerror_relse(bp); + else + return xfs_bioerror(bp); + } + + xfs_buf_iorequest(bp); + return 0; +} + +/* + * Wrapper around bdstrat so that we can stop data from going to disk in case + * we are shutting down the filesystem. Typically user data goes thru this + * path; one of the exceptions is the superblock. + */ +void +xfsbdstrat( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + if (XFS_FORCED_SHUTDOWN(mp)) { + trace_xfs_bdstrat_shut(bp, _RET_IP_); + xfs_bioerror_relse(bp); + return; + } + + xfs_buf_iorequest(bp); +} + STATIC void _xfs_buf_ioend( xfs_buf_t *bp, diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index f69b8e714a11..9a29d18656ec 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -235,6 +235,10 @@ extern void xfs_buf_unlock(xfs_buf_t *); extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); extern int xfs_bawrite(void *mp, xfs_buf_t *bp); extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); + +extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); +extern int xfs_bdstrat_cb(struct xfs_buf *); + extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); extern int xfs_buf_iorequest(xfs_buf_t *); diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 0d32457abef1..c80fa00d2ad7 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -783,53 +783,6 @@ write_retry: return -error; } -/* - * All xfs metadata buffers except log state machine buffers - * get this attached as their b_bdstrat callback function. - * This is so that we can catch a buffer - * after prematurely unpinning it to forcibly shutdown the filesystem. - */ -int -xfs_bdstrat_cb(struct xfs_buf *bp) -{ - if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { - trace_xfs_bdstrat_shut(bp, _RET_IP_); - /* - * Metadata write that didn't get logged but - * written delayed anyway. These aren't associated - * with a transaction, and can be ignored. - */ - if (XFS_BUF_IODONE_FUNC(bp) == NULL && - (XFS_BUF_ISREAD(bp)) == 0) - return (xfs_bioerror_relse(bp)); - else - return (xfs_bioerror(bp)); - } - - xfs_buf_iorequest(bp); - return 0; -} - -/* - * Wrapper around bdstrat so that we can stop data from going to disk in case - * we are shutting down the filesystem. Typically user data goes thru this - * path; one of the exceptions is the superblock. - */ -void -xfsbdstrat( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - ASSERT(mp); - if (!XFS_FORCED_SHUTDOWN(mp)) { - xfs_buf_iorequest(bp); - return; - } - - trace_xfs_bdstrat_shut(bp, _RET_IP_); - xfs_bioerror_relse(bp); -} - /* * If the underlying (data/log/rt) device is readonly, there are some * operations that cannot proceed. diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index d1f7789c7ffb..342ae8c0d011 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -22,9 +22,6 @@ struct xfs_mount; struct xfs_inode; struct xfs_buf; -/* errors from xfsbdstrat() must be extracted from the buffer */ -extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); -extern int xfs_bdstrat_cb(struct xfs_buf *); extern int xfs_dev_is_read_only(struct xfs_mount *, char *); extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index 9d933a10d6bb..abb2c458b148 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -153,88 +153,6 @@ xfs_do_force_shutdown( } } - -/* - * Called when we want to stop a buffer from getting written or read. - * We attach the EIO error, muck with its flags, and call biodone - * so that the proper iodone callbacks get called. - */ -int -xfs_bioerror( - xfs_buf_t *bp) -{ - -#ifdef XFSERRORDEBUG - ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone); -#endif - - /* - * No need to wait until the buffer is unpinned. - * We aren't flushing it. - */ - XFS_BUF_ERROR(bp, EIO); - /* - * We're calling biodone, so delete B_DONE flag. Either way - * we have to call the iodone callback, and calling biodone - * probably is the best way since it takes care of - * GRIO as well. - */ - XFS_BUF_UNREAD(bp); - XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_UNDONE(bp); - XFS_BUF_STALE(bp); - - XFS_BUF_CLR_BDSTRAT_FUNC(bp); - xfs_biodone(bp); - - return (EIO); -} - -/* - * Same as xfs_bioerror, except that we are releasing the buffer - * here ourselves, and avoiding the biodone call. - * This is meant for userdata errors; metadata bufs come with - * iodone functions attached, so that we can track down errors. - */ -int -xfs_bioerror_relse( - xfs_buf_t *bp) -{ - int64_t fl; - - ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks); - ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone); - - fl = XFS_BUF_BFLAGS(bp); - /* - * No need to wait until the buffer is unpinned. - * We aren't flushing it. - * - * chunkhold expects B_DONE to be set, whether - * we actually finish the I/O or not. We don't want to - * change that interface. - */ - XFS_BUF_UNREAD(bp); - XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_DONE(bp); - XFS_BUF_STALE(bp); - XFS_BUF_CLR_IODONE_FUNC(bp); - XFS_BUF_CLR_BDSTRAT_FUNC(bp); - if (!(fl & XFS_B_ASYNC)) { - /* - * Mark b_error and B_ERROR _both_. - * Lot's of chunkcache code assumes that. - * There's no reason to mark error for - * ASYNC buffers. - */ - XFS_BUF_ERROR(bp, EIO); - XFS_BUF_FINISH_IOWAIT(bp); - } else { - xfs_buf_relse(bp); - } - return (EIO); -} - /* * Prints out an ALERT message about I/O error. */ diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index ff68eb5e738e..a54c3b7cd376 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -40,8 +40,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) * Prototypes for functions in xfs_rw.c. */ extern int xfs_write_clear_setuid(struct xfs_inode *ip); -extern int xfs_bioerror(struct xfs_buf *bp); -extern int xfs_bioerror_relse(struct xfs_buf *bp); extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, xfs_daddr_t blkno, int len, uint flags, struct xfs_buf **bpp); -- cgit v1.2.3 From e2bcd936eb95d0019ca5e05f9fdd27e770ddded1 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Wed, 20 Jan 2010 10:44:58 +1100 Subject: xfs: directory names are unsigned Convert the struct xfs_name to use unsigned chars for the name strings to match both what is stored on disk (__uint8_t) and what the VFS expects (unsigned char). Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index d725428c9df6..b09904555d07 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -151,8 +151,8 @@ typedef enum { } xfs_btnum_t; struct xfs_name { - const char *name; - int len; + const unsigned char *name; + int len; }; #endif /* __XFS_TYPES_H__ */ -- cgit v1.2.3 From 046ea753130fc51d885835458bf8c1d84765b9ac Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Wed, 20 Jan 2010 10:47:08 +1100 Subject: xfs: convert DM ops to use unsigned char names dmops uses a signed char for it's namespace event. To be consistent with the rest of the code, convert them to unsigned char for the namespace string. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_mount.h | 3 ++- fs/xfs/xfs_vnodeops.c | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index e62fd1cde464..f4d1441f3f15 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -78,7 +78,8 @@ typedef int (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t); typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *, struct xfs_inode *, dm_right_t, struct xfs_inode *, dm_right_t, - const char *, const char *, mode_t, int, int); + const unsigned char *, const unsigned char *, + mode_t, int, int); typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t, char *, char *); typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *, diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 6f268756bf36..9f7c001ef469 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2199,7 +2199,8 @@ xfs_symlink( if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) { error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp, DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, - link_name->name, target_path, 0, 0, 0); + link_name->name, + (unsigned char *)target_path, 0, 0, 0); if (error) return error; } @@ -2395,7 +2396,8 @@ std_return: dp, DM_RIGHT_NULL, error ? NULL : ip, DM_RIGHT_NULL, link_name->name, - target_path, 0, error, 0); + (unsigned char *)target_path, + 0, error, 0); } if (!error) -- cgit v1.2.3 From 2bc754213d40d67c39ddd58cf240f2b948e1951e Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Wed, 20 Jan 2010 10:47:17 +1100 Subject: xfs: convert dirnameops to unsigned char names To be consistent across the codebase, convert the dirnameops to pass the directory names by unsigned char strings. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_da_btree.c | 4 ++-- fs/xfs/xfs_da_btree.h | 5 +++-- fs/xfs/xfs_dir2.c | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index c0c8869115b1..0ca556b4bf31 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1534,8 +1534,8 @@ xfs_da_hashname(const __uint8_t *name, int namelen) enum xfs_dacmp xfs_da_compname( struct xfs_da_args *args, - const char *name, - int len) + const unsigned char *name, + int len) { return (args->namelen == len && memcmp(args->name, name, len) == 0) ? XFS_CMP_EXACT : XFS_CMP_DIFFERENT; diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 30cd08f56a3a..fe9f5a8c1d2a 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -209,7 +209,8 @@ typedef struct xfs_da_state { */ struct xfs_nameops { xfs_dahash_t (*hashname)(struct xfs_name *); - enum xfs_dacmp (*compname)(struct xfs_da_args *, const char *, int); + enum xfs_dacmp (*compname)(struct xfs_da_args *, + const unsigned char *, int); }; @@ -260,7 +261,7 @@ int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, uint xfs_da_hashname(const __uint8_t *name_string, int name_length); enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, - const char *name, int len); + const unsigned char *name, int len); xfs_da_state_t *xfs_da_state_alloc(void); diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 93634a7e90e9..c21c527766bf 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -66,8 +66,8 @@ xfs_ascii_ci_hashname( STATIC enum xfs_dacmp xfs_ascii_ci_compname( struct xfs_da_args *args, - const char *name, - int len) + const unsigned char *name, + int len) { enum xfs_dacmp result; int i; -- cgit v1.2.3 From a3380ae39fa321282c407ba5e1835e14b64853d9 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Wed, 20 Jan 2010 10:47:25 +1100 Subject: xfs: make xfs_dir_cilookup_result use unsigned char For consistency with the result of the code. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_dir2.c | 2 +- fs/xfs/xfs_dir2.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index c21c527766bf..3a8c6ba0638f 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -247,7 +247,7 @@ xfs_dir_createname( int xfs_dir_cilookup_result( struct xfs_da_args *args, - const char *name, + const unsigned char *name, int len) { if (args->cmpresult == XFS_CMP_DIFFERENT) diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h index 1d9ef96f33aa..74a3b1057685 100644 --- a/fs/xfs/xfs_dir2.h +++ b/fs/xfs/xfs_dir2.h @@ -100,7 +100,7 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_dabuf *bp); -extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const char *name, - int len); +extern int xfs_dir_cilookup_result(struct xfs_da_args *args, + const unsigned char *name, int len); #endif /* __XFS_DIR2_H__ */ -- cgit v1.2.3 From b9c48649577dfc4a8c263c106d518effa24ea54b Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Wed, 20 Jan 2010 10:47:39 +1100 Subject: xfs: xfs_buf_iomove() doesn't care about signedness xfs_buf_iomove() uses xfs_caddr_t as it's parameter types, but it doesn't care about the signedness of the variables as it is just copying the data. Change the prototype to use void * so that we don't get sign warnings at call sites. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/linux-2.6/xfs_buf.c | 2 +- fs/xfs/linux-2.6/xfs_buf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 158fad4550df..efd745bb8887 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1443,7 +1443,7 @@ xfs_buf_iomove( xfs_buf_t *bp, /* buffer to process */ size_t boff, /* starting buffer offset */ size_t bsize, /* length to copy */ - caddr_t data, /* data address */ + void *data, /* data address */ xfs_buf_rw_t mode) /* read/write/zero flag */ { size_t bend, cpoff, csize; diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 9a29d18656ec..4f2ad66edb76 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -243,7 +243,7 @@ extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); extern int xfs_buf_iorequest(xfs_buf_t *); extern int xfs_buf_iowait(xfs_buf_t *); -extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t, +extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, xfs_buf_rw_t); static inline int xfs_buf_iostrategy(xfs_buf_t *bp) -- cgit v1.2.3 From a9273ca5c6814f393e18ed66645f817b2b71e9ad Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Wed, 20 Jan 2010 10:47:48 +1100 Subject: xfs: convert attr to use unsigned names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be consistent with the directory code, the attr code should use unsigned names. Convert the names from the vfs at the highest level to unsigned, and ænsure they are consistenly used as unsigned down to disk. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/linux-2.6/xfs_acl.c | 11 ++++++----- fs/xfs/linux-2.6/xfs_ioctl.c | 18 +++++++++--------- fs/xfs/linux-2.6/xfs_ioctl.h | 12 ++++++------ fs/xfs/linux-2.6/xfs_ioctl32.c | 4 ++-- fs/xfs/linux-2.6/xfs_iops.c | 4 ++-- fs/xfs/linux-2.6/xfs_xattr.c | 27 +++++++++++++++++++-------- fs/xfs/xfs_acl.h | 4 ++-- fs/xfs/xfs_attr.c | 38 +++++++++++++++++++++++--------------- fs/xfs/xfs_attr.h | 2 +- fs/xfs/xfs_attr_leaf.c | 28 ++++++++++++++-------------- fs/xfs/xfs_attr_sf.h | 2 +- fs/xfs/xfs_vnodeops.h | 10 +++++----- 12 files changed, 90 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index 883ca5ab8af5..bf85bbe4a9ae 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -106,7 +106,7 @@ xfs_get_acl(struct inode *inode, int type) struct posix_acl *acl; struct xfs_acl *xfs_acl; int len = sizeof(struct xfs_acl); - char *ea_name; + unsigned char *ea_name; int error; acl = get_cached_acl(inode, type); @@ -133,7 +133,8 @@ xfs_get_acl(struct inode *inode, int type) if (!xfs_acl) return ERR_PTR(-ENOMEM); - error = -xfs_attr_get(ip, ea_name, (char *)xfs_acl, &len, ATTR_ROOT); + error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl, + &len, ATTR_ROOT); if (error) { /* * If the attribute doesn't exist make sure we have a negative @@ -162,7 +163,7 @@ STATIC int xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) { struct xfs_inode *ip = XFS_I(inode); - char *ea_name; + unsigned char *ea_name; int error; if (S_ISLNK(inode->i_mode)) @@ -194,7 +195,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) (sizeof(struct xfs_acl_entry) * (XFS_ACL_MAX_ENTRIES - acl->a_count)); - error = -xfs_attr_set(ip, ea_name, (char *)xfs_acl, + error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, len, ATTR_ROOT); kfree(xfs_acl); @@ -262,7 +263,7 @@ xfs_set_mode(struct inode *inode, mode_t mode) } static int -xfs_acl_exists(struct inode *inode, char *name) +xfs_acl_exists(struct inode *inode, unsigned char *name) { int len = sizeof(struct xfs_acl); diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index a034cf624437..3906e85abfdc 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -447,12 +447,12 @@ xfs_attrlist_by_handle( int xfs_attrmulti_attr_get( struct inode *inode, - char *name, - char __user *ubuf, + unsigned char *name, + unsigned char __user *ubuf, __uint32_t *len, __uint32_t flags) { - char *kbuf; + unsigned char *kbuf; int error = EFAULT; if (*len > XATTR_SIZE_MAX) @@ -476,12 +476,12 @@ xfs_attrmulti_attr_get( int xfs_attrmulti_attr_set( struct inode *inode, - char *name, - const char __user *ubuf, + unsigned char *name, + const unsigned char __user *ubuf, __uint32_t len, __uint32_t flags) { - char *kbuf; + unsigned char *kbuf; int error = EFAULT; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) @@ -501,7 +501,7 @@ xfs_attrmulti_attr_set( int xfs_attrmulti_attr_remove( struct inode *inode, - char *name, + unsigned char *name, __uint32_t flags) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) @@ -519,7 +519,7 @@ xfs_attrmulti_by_handle( xfs_fsop_attrmulti_handlereq_t am_hreq; struct dentry *dentry; unsigned int i, size; - char *attr_name; + unsigned char *attr_name; if (!capable(CAP_SYS_ADMIN)) return -XFS_ERROR(EPERM); @@ -547,7 +547,7 @@ xfs_attrmulti_by_handle( error = 0; for (i = 0; i < am_hreq.opcount; i++) { - ops[i].am_error = strncpy_from_user(attr_name, + ops[i].am_error = strncpy_from_user((char *)attr_name, ops[i].am_attrname, MAXNAMELEN); if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) error = -ERANGE; diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h index 7bd7c6afc1eb..d56173b34a2a 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.h +++ b/fs/xfs/linux-2.6/xfs_ioctl.h @@ -45,23 +45,23 @@ xfs_readlink_by_handle( extern int xfs_attrmulti_attr_get( struct inode *inode, - char *name, - char __user *ubuf, + unsigned char *name, + unsigned char __user *ubuf, __uint32_t *len, __uint32_t flags); extern int - xfs_attrmulti_attr_set( +xfs_attrmulti_attr_set( struct inode *inode, - char *name, - const char __user *ubuf, + unsigned char *name, + const unsigned char __user *ubuf, __uint32_t len, __uint32_t flags); extern int xfs_attrmulti_attr_remove( struct inode *inode, - char *name, + unsigned char *name, __uint32_t flags); extern struct dentry * diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index be1527b1670c..0bf6d61f0528 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -411,7 +411,7 @@ xfs_compat_attrmulti_by_handle( compat_xfs_fsop_attrmulti_handlereq_t am_hreq; struct dentry *dentry; unsigned int i, size; - char *attr_name; + unsigned char *attr_name; if (!capable(CAP_SYS_ADMIN)) return -XFS_ERROR(EPERM); @@ -440,7 +440,7 @@ xfs_compat_attrmulti_by_handle( error = 0; for (i = 0; i < am_hreq.opcount; i++) { - ops[i].am_error = strncpy_from_user(attr_name, + ops[i].am_error = strncpy_from_user((char *)attr_name, compat_ptr(ops[i].am_attrname), MAXNAMELEN); if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 225946012d0b..e8566bbf0f00 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -140,10 +140,10 @@ xfs_init_security( struct xfs_inode *ip = XFS_I(inode); size_t length; void *value; - char *name; + unsigned char *name; int error; - error = security_inode_init_security(inode, dir, &name, + error = security_inode_init_security(inode, dir, (char **)&name, &value, &length); if (error) { if (error == -EOPNOTSUPP) diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c index 0b1878857fc3..fa01b9daba6b 100644 --- a/fs/xfs/linux-2.6/xfs_xattr.c +++ b/fs/xfs/linux-2.6/xfs_xattr.c @@ -45,7 +45,7 @@ xfs_xattr_get(struct dentry *dentry, const char *name, value = NULL; } - error = -xfs_attr_get(ip, name, value, &asize, xflags); + error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); if (error) return error; return asize; @@ -67,8 +67,9 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, xflags |= ATTR_REPLACE; if (!value) - return -xfs_attr_remove(ip, name, xflags); - return -xfs_attr_set(ip, name, (void *)value, size, xflags); + return -xfs_attr_remove(ip, (unsigned char *)name, xflags); + return -xfs_attr_set(ip, (unsigned char *)name, + (void *)value, size, xflags); } static struct xattr_handler xfs_xattr_user_handler = { @@ -124,8 +125,13 @@ static const char *xfs_xattr_prefix(int flags) } static int -xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags, - char *name, int namelen, int valuelen, char *value) +xfs_xattr_put_listent( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) { unsigned int prefix_len = xfs_xattr_prefix_len(flags); char *offset; @@ -148,7 +154,7 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags, offset = (char *)context->alist + context->count; strncpy(offset, xfs_xattr_prefix(flags), prefix_len); offset += prefix_len; - strncpy(offset, name, namelen); /* real name */ + strncpy(offset, (char *)name, namelen); /* real name */ offset += namelen; *offset = '\0'; context->count += prefix_len + namelen + 1; @@ -156,8 +162,13 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags, } static int -xfs_xattr_put_listent_sizes(struct xfs_attr_list_context *context, int flags, - char *name, int namelen, int valuelen, char *value) +xfs_xattr_put_listent_sizes( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) { context->count += xfs_xattr_prefix_len(flags) + namelen + 1; return 0; diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 00fd357c3e46..d13eeba2c8f8 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -36,8 +36,8 @@ struct xfs_acl { }; /* On-disk XFS extended attribute names */ -#define SGI_ACL_FILE "SGI_ACL_FILE" -#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT" +#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" +#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" #define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 9d11ebad43b6..f7b426a1b6ee 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -93,12 +93,12 @@ STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args); STATIC int xfs_attr_name_to_xname( struct xfs_name *xname, - const char *aname) + const unsigned char *aname) { if (!aname) return EINVAL; xname->name = aname; - xname->len = strlen(aname); + xname->len = strlen((char *)aname); if (xname->len >= MAXNAMELEN) return EFAULT; /* match IRIX behaviour */ @@ -124,7 +124,7 @@ STATIC int xfs_attr_get_int( struct xfs_inode *ip, struct xfs_name *name, - char *value, + unsigned char *value, int *valuelenp, int flags) { @@ -171,8 +171,8 @@ xfs_attr_get_int( int xfs_attr_get( xfs_inode_t *ip, - const char *name, - char *value, + const unsigned char *name, + unsigned char *value, int *valuelenp, int flags) { @@ -235,8 +235,12 @@ xfs_attr_calc_size( } STATIC int -xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, - char *value, int valuelen, int flags) +xfs_attr_set_int( + struct xfs_inode *dp, + struct xfs_name *name, + unsigned char *value, + int valuelen, + int flags) { xfs_da_args_t args; xfs_fsblock_t firstblock; @@ -452,8 +456,8 @@ out: int xfs_attr_set( xfs_inode_t *dp, - const char *name, - char *value, + const unsigned char *name, + unsigned char *value, int valuelen, int flags) { @@ -600,7 +604,7 @@ out: int xfs_attr_remove( xfs_inode_t *dp, - const char *name, + const unsigned char *name, int flags) { int error; @@ -669,9 +673,13 @@ xfs_attr_list_int(xfs_attr_list_context_t *context) */ /*ARGSUSED*/ STATIC int -xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags, - char *name, int namelen, - int valuelen, char *value) +xfs_attr_put_listent( + xfs_attr_list_context_t *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) { struct attrlist *alist = (struct attrlist *)context->alist; attrlist_ent_t *aep; @@ -1980,7 +1988,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE]; xfs_mount_t *mp; xfs_daddr_t dblkno; - xfs_caddr_t dst; + void *dst; xfs_buf_t *bp; int nmap, error, tmp, valuelen, blkcnt, i; xfs_dablk_t lblkno; @@ -2039,7 +2047,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) xfs_inode_t *dp; xfs_bmbt_irec_t map; xfs_daddr_t dblkno; - xfs_caddr_t src; + void *src; xfs_buf_t *bp; xfs_dablk_t lblkno; int blkcnt, valuelen, nmap, error, tmp, committed; diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index 9c3a24372914..e920d68ef509 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -113,7 +113,7 @@ typedef struct attrlist_cursor_kern { typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, - char *, int, int, char *); + unsigned char *, int, int, unsigned char *); typedef struct xfs_attr_list_context { struct xfs_inode *dp; /* inode */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index baf41b5af756..52519a201849 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -521,11 +521,11 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; i++) { - nargs.name = (char *)sfe->nameval; + nargs.name = sfe->nameval; nargs.namelen = sfe->namelen; - nargs.value = (char *)&sfe->nameval[nargs.namelen]; + nargs.value = &sfe->nameval[nargs.namelen]; nargs.valuelen = sfe->valuelen; - nargs.hashval = xfs_da_hashname((char *)sfe->nameval, + nargs.hashval = xfs_da_hashname(sfe->nameval, sfe->namelen); nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */ @@ -612,10 +612,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { error = context->put_listent(context, sfe->flags, - (char *)sfe->nameval, + sfe->nameval, (int)sfe->namelen, (int)sfe->valuelen, - (char*)&sfe->nameval[sfe->namelen]); + &sfe->nameval[sfe->namelen]); /* * Either search callback finished early or @@ -659,8 +659,8 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) } sbp->entno = i; - sbp->hash = xfs_da_hashname((char *)sfe->nameval, sfe->namelen); - sbp->name = (char *)sfe->nameval; + sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen); + sbp->name = sfe->nameval; sbp->namelen = sfe->namelen; /* These are bytes, and both on-disk, don't endian-flip */ sbp->valuelen = sfe->valuelen; @@ -818,9 +818,9 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) continue; ASSERT(entry->flags & XFS_ATTR_LOCAL); name_loc = xfs_attr_leaf_name_local(leaf, i); - nargs.name = (char *)name_loc->nameval; + nargs.name = name_loc->nameval; nargs.namelen = name_loc->namelen; - nargs.value = (char *)&name_loc->nameval[nargs.namelen]; + nargs.value = &name_loc->nameval[nargs.namelen]; nargs.valuelen = be16_to_cpu(name_loc->valuelen); nargs.hashval = be32_to_cpu(entry->hashval); nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags); @@ -2370,10 +2370,10 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) retval = context->put_listent(context, entry->flags, - (char *)name_loc->nameval, + name_loc->nameval, (int)name_loc->namelen, be16_to_cpu(name_loc->valuelen), - (char *)&name_loc->nameval[name_loc->namelen]); + &name_loc->nameval[name_loc->namelen]); if (retval) return retval; } else { @@ -2397,15 +2397,15 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) return retval; retval = context->put_listent(context, entry->flags, - (char *)name_rmt->name, + name_rmt->name, (int)name_rmt->namelen, valuelen, - (char*)args.value); + args.value); kmem_free(args.value); } else { retval = context->put_listent(context, entry->flags, - (char *)name_rmt->name, + name_rmt->name, (int)name_rmt->namelen, valuelen, NULL); diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h index 76ab7b0cbb3a..919756e3ba53 100644 --- a/fs/xfs/xfs_attr_sf.h +++ b/fs/xfs/xfs_attr_sf.h @@ -52,7 +52,7 @@ typedef struct xfs_attr_sf_sort { __uint8_t valuelen; /* length of value */ __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ xfs_dahash_t hash; /* this entry's hash value */ - char *name; /* name value, pointer into buffer */ + unsigned char *name; /* name value, pointer into buffer */ } xfs_attr_sf_sort_t; #define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \ diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 167a467403a5..774f40729ca1 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -43,11 +43,11 @@ int xfs_change_file_space(struct xfs_inode *ip, int cmd, int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, struct xfs_inode *target_dp, struct xfs_name *target_name, struct xfs_inode *target_ip); -int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value, - int *valuelenp, int flags); -int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value, - int valuelen, int flags); -int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags); +int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, + unsigned char *value, int *valuelenp, int flags); +int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, + unsigned char *value, int valuelen, int flags); +int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb, -- cgit v1.2.3 From 4a24cb71407dc25035d75dd3d118e0e55679e217 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Wed, 20 Jan 2010 10:48:05 +1100 Subject: xfs: clean up sign warnings in dir2 code We are now consistently using unsigned char strings for names so fix up the remaining warnings in the dir2 code to complete the cleanup. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_dir2.c | 2 +- fs/xfs/xfs_dir2_block.c | 9 +++++---- fs/xfs/xfs_dir2_leaf.c | 2 +- fs/xfs/xfs_dir2_sf.c | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 3a8c6ba0638f..42520f041265 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -44,7 +44,7 @@ #include "xfs_vnodeops.h" #include "xfs_trace.h" -struct xfs_name xfs_name_dotdot = {"..", 2}; +struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2}; /* * ASCII case-insensitive (ie. A-Z) support for directories that was diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index ddc4ecc7807f..779a267b0a84 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -57,8 +57,8 @@ static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot; void xfs_dir_startup(void) { - xfs_dir_hash_dot = xfs_da_hashname(".", 1); - xfs_dir_hash_dotdot = xfs_da_hashname("..", 2); + xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1); + xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); } /* @@ -513,8 +513,9 @@ xfs_dir2_block_getdents( /* * If it didn't fit, set the final offset to here & return. */ - if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff, - be64_to_cpu(dep->inumber), DT_UNKNOWN)) { + if (filldir(dirent, (char *)dep->name, dep->namelen, + cook & 0x7fffffff, be64_to_cpu(dep->inumber), + DT_UNKNOWN)) { *offset = cook & 0x7fffffff; xfs_da_brelse(NULL, bp); return 0; diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 29f484c11b3a..e2d89854ec9e 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -1081,7 +1081,7 @@ xfs_dir2_leaf_getdents( dep = (xfs_dir2_data_entry_t *)ptr; length = xfs_dir2_data_entsize(dep->namelen); - if (filldir(dirent, dep->name, dep->namelen, + if (filldir(dirent, (char *)dep->name, dep->namelen, xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, be64_to_cpu(dep->inumber), DT_UNKNOWN)) break; diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 9d4f17a69676..c1a5945d463a 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -782,7 +782,7 @@ xfs_dir2_sf_getdents( } ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); - if (filldir(dirent, sfep->name, sfep->namelen, + if (filldir(dirent, (char *)sfep->name, sfep->namelen, off & 0x7fffffff, ino, DT_UNKNOWN)) { *offset = off & 0x7fffffff; return 0; -- cgit v1.2.3 From 58c75cfb51393a52b45262394c1fa81514b4d9bd Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Wed, 20 Jan 2010 10:49:18 +1100 Subject: xfs: make compile warn about char sign mismatches again The -fno-unsigned-char directive has no effect anymore as the XFs build is clean. However, the kernel build hides pointer sign differences so turn that back on so that we can clean up all the mismatches prior to a userspace code resync. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 56641fe52a23..19267019dacf 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -16,7 +16,7 @@ # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA # -EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 -funsigned-char +EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 -Wpointer-sign XFS_LINUX := linux-2.6 -- cgit v1.2.3 From f0a0eaa8da08ebc6519cacd731df05bbb4ca47ce Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Wed, 20 Jan 2010 10:50:06 +1100 Subject: xfs: suppress spurious uninitialised var warning in xfs_bmapi() Initialise the xfs_bmalloca_t structure to zero to avoid uninitialised variable warnings. This is done by zeroing the arg structure rather than using the uninitialised_var() trick so we know for certain that the structure is correctly initialised as xfs_bmapi is a very complex function and it is difficult to prove warnings are spurious. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 7c6d9acd7154..1869fb973819 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4471,7 +4471,7 @@ xfs_bmapi( xfs_fsblock_t abno; /* allocated block number */ xfs_extlen_t alen; /* allocated extent length */ xfs_fileoff_t aoff; /* allocated file offset */ - xfs_bmalloca_t bma; /* args for xfs_bmap_alloc */ + xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */ xfs_btree_cur_t *cur; /* bmap btree cursor */ xfs_fileoff_t end; /* end of mapped file region */ int eof; /* we've hit the end of extents */ -- cgit v1.2.3 From 587aa0feb74ffe3239b5e26ff5d017ba9f5daec9 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Wed, 20 Jan 2010 12:04:53 +1100 Subject: xfs: rearrange xfs_mod_sb() to avoid array subscript warning gcc warns of an array subscript out of bounds in xfs_mod_sb(). The code is written in such a way that if the array subscript is out of bounds, then it will assert fail. Rearrange the code to avoid the bounds check warning. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_mount.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 0df5045abd3b..d95bd1809f3c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1631,15 +1631,14 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields); /* find modified range */ + f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields); + ASSERT((1LL << f) & XFS_SB_MOD_BITS); + last = xfs_sb_info[f + 1].offset - 1; f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); ASSERT((1LL << f) & XFS_SB_MOD_BITS); first = xfs_sb_info[f].offset; - f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields); - ASSERT((1LL << f) & XFS_SB_MOD_BITS); - last = xfs_sb_info[f + 1].offset - 1; - xfs_trans_log_buf(tp, bp, first, last); } -- cgit v1.2.3 From e6a6d3795565b8ccb957afc6ca0e50db40b2d899 Mon Sep 17 00:00:00 2001 From: Phillip Lougher <phillip@lougher.demon.co.uk> Date: Tue, 22 Sep 2009 19:25:24 +0100 Subject: Squashfs: move zlib decompression wrapper code into a separate file Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk> --- fs/squashfs/Makefile | 2 +- fs/squashfs/block.c | 74 ++---------------------------- fs/squashfs/squashfs.h | 4 ++ fs/squashfs/zlib_wrapper.c | 109 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 118 insertions(+), 71 deletions(-) create mode 100644 fs/squashfs/zlib_wrapper.c (limited to 'fs') diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 70e3244fa30f..a397e6f12ab5 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o -squashfs-y += namei.o super.o symlink.o +squashfs-y += namei.o super.o symlink.o zlib_wrapper.o diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 2a7960310349..b8addfdc6094 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -29,7 +29,6 @@ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> -#include <linux/mutex.h> #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/zlib.h> @@ -153,72 +152,10 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, } if (compressed) { - int zlib_err = 0, zlib_init = 0; - - /* - * Uncompress block. - */ - - mutex_lock(&msblk->read_data_mutex); - - msblk->stream.avail_out = 0; - msblk->stream.avail_in = 0; - - bytes = length; - do { - if (msblk->stream.avail_in == 0 && k < b) { - avail = min(bytes, msblk->devblksize - offset); - bytes -= avail; - wait_on_buffer(bh[k]); - if (!buffer_uptodate(bh[k])) - goto release_mutex; - - if (avail == 0) { - offset = 0; - put_bh(bh[k++]); - continue; - } - - msblk->stream.next_in = bh[k]->b_data + offset; - msblk->stream.avail_in = avail; - offset = 0; - } - - if (msblk->stream.avail_out == 0 && page < pages) { - msblk->stream.next_out = buffer[page++]; - msblk->stream.avail_out = PAGE_CACHE_SIZE; - } - - if (!zlib_init) { - zlib_err = zlib_inflateInit(&msblk->stream); - if (zlib_err != Z_OK) { - ERROR("zlib_inflateInit returned" - " unexpected result 0x%x," - " srclength %d\n", zlib_err, - srclength); - goto release_mutex; - } - zlib_init = 1; - } - - zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH); - - if (msblk->stream.avail_in == 0 && k < b) - put_bh(bh[k++]); - } while (zlib_err == Z_OK); - - if (zlib_err != Z_STREAM_END) { - ERROR("zlib_inflate error, data probably corrupt\n"); - goto release_mutex; - } - - zlib_err = zlib_inflateEnd(&msblk->stream); - if (zlib_err != Z_OK) { - ERROR("zlib_inflate error, data probably corrupt\n"); - goto release_mutex; - } - length = msblk->stream.total_out; - mutex_unlock(&msblk->read_data_mutex); + length = squashfs_zlib_uncompress(msblk, buffer, bh, b, offset, + length, srclength, pages); + if (length < 0) + goto read_failure; } else { /* * Block is uncompressed. @@ -255,9 +192,6 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, kfree(bh); return length; -release_mutex: - mutex_unlock(&msblk->read_data_mutex); - block_release: for (; k < b; k++) put_bh(bh[k]); diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 0e9feb6adf7e..ba87db693651 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -70,6 +70,10 @@ extern struct inode *squashfs_iget(struct super_block *, long long, unsigned int); extern int squashfs_read_inode(struct inode *, long long); +/* zlib_wrapper.c */ +extern int squashfs_zlib_uncompress(struct squashfs_sb_info *, void **, + struct buffer_head **, int, int, int, int, int); + /* * Inodes and files operations */ diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c new file mode 100644 index 000000000000..3be99642d6af --- /dev/null +++ b/fs/squashfs/zlib_wrapper.c @@ -0,0 +1,109 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * zlib_wrapper.c + */ + + +#include <linux/mutex.h> +#include <linux/buffer_head.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +int squashfs_zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, + struct buffer_head **bh, int b, int offset, int length, int srclength, + int pages) +{ + int zlib_err = 0, zlib_init = 0; + int avail, bytes, k = 0, page = 0; + + mutex_lock(&msblk->read_data_mutex); + + msblk->stream.avail_out = 0; + msblk->stream.avail_in = 0; + + bytes = length; + do { + if (msblk->stream.avail_in == 0 && k < b) { + avail = min(bytes, msblk->devblksize - offset); + bytes -= avail; + wait_on_buffer(bh[k]); + if (!buffer_uptodate(bh[k])) + goto release_mutex; + + if (avail == 0) { + offset = 0; + put_bh(bh[k++]); + continue; + } + + msblk->stream.next_in = bh[k]->b_data + offset; + msblk->stream.avail_in = avail; + offset = 0; + } + + if (msblk->stream.avail_out == 0 && page < pages) { + msblk->stream.next_out = buffer[page++]; + msblk->stream.avail_out = PAGE_CACHE_SIZE; + } + + if (!zlib_init) { + zlib_err = zlib_inflateInit(&msblk->stream); + if (zlib_err != Z_OK) { + ERROR("zlib_inflateInit returned unexpected " + "result 0x%x, srclength %d\n", + zlib_err, srclength); + goto release_mutex; + } + zlib_init = 1; + } + + zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH); + + if (msblk->stream.avail_in == 0 && k < b) + put_bh(bh[k++]); + } while (zlib_err == Z_OK); + + if (zlib_err != Z_STREAM_END) { + ERROR("zlib_inflate error, data probably corrupt\n"); + goto release_mutex; + } + + zlib_err = zlib_inflateEnd(&msblk->stream); + if (zlib_err != Z_OK) { + ERROR("zlib_inflate error, data probably corrupt\n"); + goto release_mutex; + } + + mutex_unlock(&msblk->read_data_mutex); + return msblk->stream.total_out; + +release_mutex: + mutex_unlock(&msblk->read_data_mutex); + + for (; k < b; k++) + put_bh(bh[k]); + + return -EIO; +} -- cgit v1.2.3 From f1a40359f8d8ba073257ed31a513e492621bcbc5 Mon Sep 17 00:00:00 2001 From: Phillip Lougher <phillip@lougher.demon.co.uk> Date: Wed, 23 Sep 2009 19:04:49 +0100 Subject: Squashfs: factor out remaining zlib dependencies into separate wrapper file Move zlib buffer init/destroy code into separate wrapper file. Also make zlib z_stream field a void * removing the need to include zlib.h for most files. Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk> --- fs/squashfs/block.c | 1 - fs/squashfs/cache.c | 1 - fs/squashfs/dir.c | 1 - fs/squashfs/export.c | 1 - fs/squashfs/file.c | 1 - fs/squashfs/fragment.c | 1 - fs/squashfs/id.c | 1 - fs/squashfs/inode.c | 1 - fs/squashfs/namei.c | 1 - fs/squashfs/squashfs.h | 2 ++ fs/squashfs/squashfs_fs_sb.h | 2 +- fs/squashfs/super.c | 14 ++++------- fs/squashfs/symlink.c | 1 - fs/squashfs/zlib_wrapper.c | 56 ++++++++++++++++++++++++++++++++++---------- 14 files changed, 51 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index b8addfdc6094..3f836e181eb8 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -31,7 +31,6 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/buffer_head.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index 40c98fa6b5d6..57314bee9059 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -51,7 +51,6 @@ #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/wait.h> -#include <linux/zlib.h> #include <linux/pagemap.h> #include "squashfs_fs.h" diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c index 566b0eaed868..12b933ac6585 100644 --- a/fs/squashfs/dir.c +++ b/fs/squashfs/dir.c @@ -30,7 +30,6 @@ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c index 2b1b8fe5e037..7f93d5a9ee05 100644 --- a/fs/squashfs/export.c +++ b/fs/squashfs/export.c @@ -39,7 +39,6 @@ #include <linux/vfs.h> #include <linux/dcache.h> #include <linux/exportfs.h> -#include <linux/zlib.h> #include <linux/slab.h> #include "squashfs_fs.h" diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 717767d831df..a25c5060bdcb 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -47,7 +47,6 @@ #include <linux/string.h> #include <linux/pagemap.h> #include <linux/mutex.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c index b5a2c15bbbc7..7c90bbd6879d 100644 --- a/fs/squashfs/fragment.c +++ b/fs/squashfs/fragment.c @@ -36,7 +36,6 @@ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c index 3795b837ba28..b7f64bcd2b70 100644 --- a/fs/squashfs/id.c +++ b/fs/squashfs/id.c @@ -34,7 +34,6 @@ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 9101dbde39ec..49daaf669e41 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -40,7 +40,6 @@ #include <linux/fs.h> #include <linux/vfs.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c index 9e398653b22b..5266bd8ad932 100644 --- a/fs/squashfs/namei.c +++ b/fs/squashfs/namei.c @@ -57,7 +57,6 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/dcache.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index ba87db693651..9c2f76a1c50b 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -71,6 +71,8 @@ extern struct inode *squashfs_iget(struct super_block *, long long, extern int squashfs_read_inode(struct inode *, long long); /* zlib_wrapper.c */ +extern void *squashfs_zlib_init(void); +extern void squashfs_zlib_free(void *); extern int squashfs_zlib_uncompress(struct squashfs_sb_info *, void **, struct buffer_head **, int, int, int, int, int); diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index c8c65614dd1c..23a67fa40b03 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -64,7 +64,7 @@ struct squashfs_sb_info { struct mutex read_data_mutex; struct mutex meta_index_mutex; struct meta_index *meta_index; - z_stream stream; + void *stream; __le64 *inode_lookup_table; u64 inode_table; u64 directory_table; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 6c197ef53add..b9f8c6a92d6a 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -35,7 +35,6 @@ #include <linux/pagemap.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/zlib.h> #include <linux/magic.h> #include "squashfs_fs.h" @@ -87,12 +86,9 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) } msblk = sb->s_fs_info; - msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(), - GFP_KERNEL); - if (msblk->stream.workspace == NULL) { - ERROR("Failed to allocate zlib workspace\n"); + msblk->stream = squashfs_zlib_init(); + if (msblk->stream == NULL) goto failure; - } sblk = kzalloc(sizeof(*sblk), GFP_KERNEL); if (sblk == NULL) { @@ -292,17 +288,17 @@ failed_mount: squashfs_cache_delete(msblk->block_cache); squashfs_cache_delete(msblk->fragment_cache); squashfs_cache_delete(msblk->read_page); + squashfs_zlib_free(msblk->stream); kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); kfree(msblk->id_table); - kfree(msblk->stream.workspace); kfree(sb->s_fs_info); sb->s_fs_info = NULL; kfree(sblk); return err; failure: - kfree(msblk->stream.workspace); + squashfs_zlib_free(msblk->stream); kfree(sb->s_fs_info); sb->s_fs_info = NULL; return -ENOMEM; @@ -346,10 +342,10 @@ static void squashfs_put_super(struct super_block *sb) squashfs_cache_delete(sbi->block_cache); squashfs_cache_delete(sbi->fragment_cache); squashfs_cache_delete(sbi->read_page); + squashfs_zlib_free(sbi->stream); kfree(sbi->id_table); kfree(sbi->fragment_index); kfree(sbi->meta_index); - kfree(sbi->stream.workspace); kfree(sb->s_fs_info); sb->s_fs_info = NULL; } diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c index 83d87880aac8..e80be2022a7f 100644 --- a/fs/squashfs/symlink.c +++ b/fs/squashfs/symlink.c @@ -36,7 +36,6 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/pagemap.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index 3be99642d6af..c814594d522e 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -31,21 +31,51 @@ #include "squashfs_fs_i.h" #include "squashfs.h" +void *squashfs_zlib_init() +{ + z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL); + if (stream == NULL) + goto failed; + stream->workspace = kmalloc(zlib_inflate_workspacesize(), + GFP_KERNEL); + if (stream->workspace == NULL) + goto failed; + + return stream; + +failed: + ERROR("Failed to allocate zlib workspace\n"); + kfree(stream); + return NULL; +} + + +void squashfs_zlib_free(void *strm) +{ + z_stream *stream = strm; + + if (stream) + kfree(stream->workspace); + kfree(stream); +} + + int squashfs_zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, struct buffer_head **bh, int b, int offset, int length, int srclength, int pages) { int zlib_err = 0, zlib_init = 0; int avail, bytes, k = 0, page = 0; + z_stream *stream = msblk->stream; mutex_lock(&msblk->read_data_mutex); - msblk->stream.avail_out = 0; - msblk->stream.avail_in = 0; + stream->avail_out = 0; + stream->avail_in = 0; bytes = length; do { - if (msblk->stream.avail_in == 0 && k < b) { + if (stream->avail_in == 0 && k < b) { avail = min(bytes, msblk->devblksize - offset); bytes -= avail; wait_on_buffer(bh[k]); @@ -58,18 +88,18 @@ int squashfs_zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, continue; } - msblk->stream.next_in = bh[k]->b_data + offset; - msblk->stream.avail_in = avail; + stream->next_in = bh[k]->b_data + offset; + stream->avail_in = avail; offset = 0; } - if (msblk->stream.avail_out == 0 && page < pages) { - msblk->stream.next_out = buffer[page++]; - msblk->stream.avail_out = PAGE_CACHE_SIZE; + if (stream->avail_out == 0 && page < pages) { + stream->next_out = buffer[page++]; + stream->avail_out = PAGE_CACHE_SIZE; } if (!zlib_init) { - zlib_err = zlib_inflateInit(&msblk->stream); + zlib_err = zlib_inflateInit(stream); if (zlib_err != Z_OK) { ERROR("zlib_inflateInit returned unexpected " "result 0x%x, srclength %d\n", @@ -79,9 +109,9 @@ int squashfs_zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, zlib_init = 1; } - zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH); + zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH); - if (msblk->stream.avail_in == 0 && k < b) + if (stream->avail_in == 0 && k < b) put_bh(bh[k++]); } while (zlib_err == Z_OK); @@ -90,14 +120,14 @@ int squashfs_zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, goto release_mutex; } - zlib_err = zlib_inflateEnd(&msblk->stream); + zlib_err = zlib_inflateEnd(stream); if (zlib_err != Z_OK) { ERROR("zlib_inflate error, data probably corrupt\n"); goto release_mutex; } mutex_unlock(&msblk->read_data_mutex); - return msblk->stream.total_out; + return stream->total_out; release_mutex: mutex_unlock(&msblk->read_data_mutex); -- cgit v1.2.3 From 4c0f0bb2351bee3de8dd7715ee199454a59f1230 Mon Sep 17 00:00:00 2001 From: Phillip Lougher <phillip@lougher.demon.co.uk> Date: Tue, 6 Oct 2009 04:04:15 +0100 Subject: Squashfs: add a decompressor framework This adds a decompressor framework which allows multiple compression algorithms to be cleanly supported. Also update zlib wrapper and other code to use the new framework. Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk> --- fs/squashfs/Makefile | 2 +- fs/squashfs/block.c | 3 ++- fs/squashfs/decompressor.c | 58 ++++++++++++++++++++++++++++++++++++++++++++ fs/squashfs/decompressor.h | 55 +++++++++++++++++++++++++++++++++++++++++ fs/squashfs/squashfs.h | 14 +++++------ fs/squashfs/squashfs_fs_sb.h | 41 ++++++++++++++++--------------- fs/squashfs/super.c | 45 +++++++++++++++++++--------------- fs/squashfs/zlib_wrapper.c | 17 ++++++++++--- 8 files changed, 184 insertions(+), 51 deletions(-) create mode 100644 fs/squashfs/decompressor.c create mode 100644 fs/squashfs/decompressor.h (limited to 'fs') diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index a397e6f12ab5..df8a19ef870d 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o -squashfs-y += namei.o super.o symlink.o zlib_wrapper.o +squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 3f836e181eb8..1cb0d81b164b 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -36,6 +36,7 @@ #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "decompressor.h" /* * Read the metadata block length, this is stored in the first two @@ -151,7 +152,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, } if (compressed) { - length = squashfs_zlib_uncompress(msblk, buffer, bh, b, offset, + length = squashfs_decompress(msblk, buffer, bh, b, offset, length, srclength, pages); if (length < 0) goto read_failure; diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c new file mode 100644 index 000000000000..0072ccdac1e2 --- /dev/null +++ b/fs/squashfs/decompressor.c @@ -0,0 +1,58 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * decompressor.c + */ + +#include <linux/types.h> +#include <linux/mutex.h> +#include <linux/buffer_head.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "decompressor.h" +#include "squashfs.h" + +/* + * This file (and decompressor.h) implements a decompressor framework for + * Squashfs, allowing multiple decompressors to be easily supported + */ + +static const struct squashfs_decompressor squashfs_unknown_comp_ops = { + NULL, NULL, NULL, 0, "unknown", 0 +}; + +static const struct squashfs_decompressor *decompressor[] = { + &squashfs_zlib_comp_ops, + &squashfs_unknown_comp_ops +}; + + +const struct squashfs_decompressor *squashfs_lookup_decompressor(int id) +{ + int i; + + for (i = 0; decompressor[i]->id; i++) + if (id == decompressor[i]->id) + break; + + return decompressor[i]; +} diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h new file mode 100644 index 000000000000..7425f80783f6 --- /dev/null +++ b/fs/squashfs/decompressor.h @@ -0,0 +1,55 @@ +#ifndef DECOMPRESSOR_H +#define DECOMPRESSOR_H +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * decompressor.h + */ + +struct squashfs_decompressor { + void *(*init)(struct squashfs_sb_info *); + void (*free)(void *); + int (*decompress)(struct squashfs_sb_info *, void **, + struct buffer_head **, int, int, int, int, int); + int id; + char *name; + int supported; +}; + +static inline void *squashfs_decompressor_init(struct squashfs_sb_info *msblk) +{ + return msblk->decompressor->init(msblk); +} + +static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk, + void *s) +{ + if (msblk->decompressor) + msblk->decompressor->free(s); +} + +static inline int squashfs_decompress(struct squashfs_sb_info *msblk, + void **buffer, struct buffer_head **bh, int b, int offset, int length, + int srclength, int pages) +{ + return msblk->decompressor->decompress(msblk, buffer, bh, b, offset, + length, srclength, pages); +} +#endif diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 9c2f76a1c50b..fe2587af5512 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -51,6 +51,9 @@ extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *, u64, int); extern int squashfs_read_table(struct super_block *, void *, u64, int); +/* decompressor.c */ +extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); + /* export.c */ extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, unsigned int); @@ -70,14 +73,8 @@ extern struct inode *squashfs_iget(struct super_block *, long long, unsigned int); extern int squashfs_read_inode(struct inode *, long long); -/* zlib_wrapper.c */ -extern void *squashfs_zlib_init(void); -extern void squashfs_zlib_free(void *); -extern int squashfs_zlib_uncompress(struct squashfs_sb_info *, void **, - struct buffer_head **, int, int, int, int, int); - /* - * Inodes and files operations + * Inodes, files and decompressor operations */ /* dir.c */ @@ -94,3 +91,6 @@ extern const struct inode_operations squashfs_dir_inode_ops; /* symlink.c */ extern const struct address_space_operations squashfs_symlink_aops; + +/* zlib_wrapper.c */ +extern const struct squashfs_decompressor squashfs_zlib_comp_ops; diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 23a67fa40b03..753335085e41 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -52,25 +52,26 @@ struct squashfs_cache_entry { }; struct squashfs_sb_info { - int devblksize; - int devblksize_log2; - struct squashfs_cache *block_cache; - struct squashfs_cache *fragment_cache; - struct squashfs_cache *read_page; - int next_meta_index; - __le64 *id_table; - __le64 *fragment_index; - unsigned int *fragment_index_2; - struct mutex read_data_mutex; - struct mutex meta_index_mutex; - struct meta_index *meta_index; - void *stream; - __le64 *inode_lookup_table; - u64 inode_table; - u64 directory_table; - unsigned int block_size; - unsigned short block_log; - long long bytes_used; - unsigned int inodes; + const struct squashfs_decompressor *decompressor; + int devblksize; + int devblksize_log2; + struct squashfs_cache *block_cache; + struct squashfs_cache *fragment_cache; + struct squashfs_cache *read_page; + int next_meta_index; + __le64 *id_table; + __le64 *fragment_index; + unsigned int *fragment_index_2; + struct mutex read_data_mutex; + struct mutex meta_index_mutex; + struct meta_index *meta_index; + void *stream; + __le64 *inode_lookup_table; + u64 inode_table; + u64 directory_table; + unsigned int block_size; + unsigned short block_log; + long long bytes_used; + unsigned int inodes; }; #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index b9f8c6a92d6a..3550aec2f655 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -41,27 +41,35 @@ #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "decompressor.h" static struct file_system_type squashfs_fs_type; static const struct super_operations squashfs_super_ops; -static int supported_squashfs_filesystem(short major, short minor, short comp) +static const struct squashfs_decompressor *supported_squashfs_filesystem(short + major, short minor, short id) { + const struct squashfs_decompressor *decompressor; + if (major < SQUASHFS_MAJOR) { ERROR("Major/Minor mismatch, older Squashfs %d.%d " "filesystems are unsupported\n", major, minor); - return -EINVAL; + return NULL; } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) { ERROR("Major/Minor mismatch, trying to mount newer " "%d.%d filesystem\n", major, minor); ERROR("Please update your kernel\n"); - return -EINVAL; + return NULL; } - if (comp != ZLIB_COMPRESSION) - return -EINVAL; + decompressor = squashfs_lookup_decompressor(id); + if (!decompressor->supported) { + ERROR("Filesystem uses \"%s\" compression. This is not " + "supported\n", decompressor->name); + return NULL; + } - return 0; + return decompressor; } @@ -86,10 +94,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) } msblk = sb->s_fs_info; - msblk->stream = squashfs_zlib_init(); - if (msblk->stream == NULL) - goto failure; - sblk = kzalloc(sizeof(*sblk), GFP_KERNEL); if (sblk == NULL) { ERROR("Failed to allocate squashfs_super_block\n"); @@ -116,25 +120,25 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + err = -EINVAL; + /* Check it is a SQUASHFS superblock */ sb->s_magic = le32_to_cpu(sblk->s_magic); if (sb->s_magic != SQUASHFS_MAGIC) { if (!silent) ERROR("Can't find a SQUASHFS superblock on %s\n", bdevname(sb->s_bdev, b)); - err = -EINVAL; goto failed_mount; } - /* Check the MAJOR & MINOR versions and compression type */ - err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major), + /* Check the MAJOR & MINOR versions and lookup compression type */ + msblk->decompressor = supported_squashfs_filesystem( + le16_to_cpu(sblk->s_major), le16_to_cpu(sblk->s_minor), le16_to_cpu(sblk->compression)); - if (err < 0) + if (msblk->decompressor == NULL) goto failed_mount; - err = -EINVAL; - /* * Check if there's xattrs in the filesystem. These are not * supported in this version, so warn that they will be ignored. @@ -201,6 +205,10 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) err = -ENOMEM; + msblk->stream = squashfs_decompressor_init(msblk); + if (msblk->stream == NULL) + goto failed_mount; + msblk->block_cache = squashfs_cache_init("metadata", SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); if (msblk->block_cache == NULL) @@ -288,7 +296,7 @@ failed_mount: squashfs_cache_delete(msblk->block_cache); squashfs_cache_delete(msblk->fragment_cache); squashfs_cache_delete(msblk->read_page); - squashfs_zlib_free(msblk->stream); + squashfs_decompressor_free(msblk, msblk->stream); kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); kfree(msblk->id_table); @@ -298,7 +306,6 @@ failed_mount: return err; failure: - squashfs_zlib_free(msblk->stream); kfree(sb->s_fs_info); sb->s_fs_info = NULL; return -ENOMEM; @@ -342,7 +349,7 @@ static void squashfs_put_super(struct super_block *sb) squashfs_cache_delete(sbi->block_cache); squashfs_cache_delete(sbi->fragment_cache); squashfs_cache_delete(sbi->read_page); - squashfs_zlib_free(sbi->stream); + squashfs_decompressor_free(sbi, sbi->stream); kfree(sbi->id_table); kfree(sbi->fragment_index); kfree(sbi->meta_index); diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index c814594d522e..4dd70e04333b 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -30,8 +30,9 @@ #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "decompressor.h" -void *squashfs_zlib_init() +static void *zlib_init(struct squashfs_sb_info *dummy) { z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL); if (stream == NULL) @@ -50,7 +51,7 @@ failed: } -void squashfs_zlib_free(void *strm) +static void zlib_free(void *strm) { z_stream *stream = strm; @@ -60,7 +61,7 @@ void squashfs_zlib_free(void *strm) } -int squashfs_zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, +static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, struct buffer_head **bh, int b, int offset, int length, int srclength, int pages) { @@ -137,3 +138,13 @@ release_mutex: return -EIO; } + +const struct squashfs_decompressor squashfs_zlib_comp_ops = { + .init = zlib_init, + .free = zlib_free, + .decompress = zlib_uncompress, + .id = ZLIB_COMPRESSION, + .name = "zlib", + .supported = 1 +}; + -- cgit v1.2.3 From dc3256782f88602953676c447b243dedb1be99ad Mon Sep 17 00:00:00 2001 From: Phillip Lougher <phillip@lougher.demon.co.uk> Date: Wed, 14 Oct 2009 03:58:11 +0100 Subject: Squashfs: add decompressor entries for lzma and lzo Add knowledge of lzma/lzo compression formats to the decompressor framework. For now these are added as unsupported. Without these entries lzma/lzo compressed filesystems will be flagged as having unknown compression which is undesirable. Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk> --- fs/squashfs/decompressor.c | 10 ++++++++++ fs/squashfs/squashfs_fs.h | 4 +++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index 0072ccdac1e2..157478da6ac9 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -36,12 +36,22 @@ * Squashfs, allowing multiple decompressors to be easily supported */ +static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { + NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 +}; + +static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = { + NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 +}; + static const struct squashfs_decompressor squashfs_unknown_comp_ops = { NULL, NULL, NULL, 0, "unknown", 0 }; static const struct squashfs_decompressor *decompressor[] = { &squashfs_zlib_comp_ops, + &squashfs_lzma_unsupported_comp_ops, + &squashfs_lzo_unsupported_comp_ops, &squashfs_unknown_comp_ops }; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 283daafc568e..36e1604ab1c1 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -211,7 +211,9 @@ struct meta_index { /* * definitions for structures on disk */ -#define ZLIB_COMPRESSION 1 +#define ZLIB_COMPRESSION 1 +#define LZMA_COMPRESSION 2 +#define LZO_COMPRESSION 3 struct squashfs_super_block { __le32 s_magic; -- cgit v1.2.3 From 512dd1abd9539a474f2792eeaf6783c59ad7778a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 13 Jan 2010 22:05:48 +0000 Subject: xfs: kill XFS_QMOPT_ASYNC The option is unused and one of the few remaining users of xfs_bawrite, so let's get rid of it. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/quota/xfs_dquot.c | 2 -- fs/xfs/xfs_quota.h | 1 - 2 files changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index d7c7eea09fc2..a447493690eb 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -1253,8 +1253,6 @@ xfs_qm_dqflush( if (flags & XFS_QMOPT_DELWRI) { xfs_bdwrite(mp, bp); - } else if (flags & XFS_QMOPT_ASYNC) { - error = xfs_bawrite(mp, bp); } else { error = xfs_bwrite(mp, bp); } diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 91bfd60f4c74..21d11d9f48f2 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -226,7 +226,6 @@ typedef struct xfs_qoff_logformat { * flags for dqflush and dqflush_all. */ #define XFS_QMOPT_SYNC 0x1000000 -#define XFS_QMOPT_ASYNC 0x2000000 #define XFS_QMOPT_DELWRI 0x4000000 /* -- cgit v1.2.3 From 4d1f88d75b00c4d23f4c51305ab5b779a86ef74e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 13 Jan 2010 22:05:49 +0000 Subject: xfs: clean up error handling in xfs_trans_dqresv Move the error code selection after the goto label and fold the xfs_quota_error helper into it. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/quota/xfs_trans_dquot.c | 48 +++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index 97ac9640be98..b9db6f781cd6 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -589,14 +589,6 @@ xfs_trans_unreserve_and_mod_dquots( } } -STATIC int -xfs_quota_error(uint flags) -{ - if (flags & XFS_QMOPT_ENOSPC) - return ENOSPC; - return EDQUOT; -} - /* * This reserves disk blocks and inodes against a dquot. * Flags indicate if the dquot is to be locked here and also @@ -612,7 +604,6 @@ xfs_trans_dqresv( long ninos, uint flags) { - int error; xfs_qcnt_t hardlimit; xfs_qcnt_t softlimit; time_t timer; @@ -649,7 +640,6 @@ xfs_trans_dqresv( warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); resbcountp = &dqp->q_res_rtbcount; } - error = 0; if ((flags & XFS_QMOPT_FORCE_RES) == 0 && dqp->q_core.d_id && @@ -667,19 +657,13 @@ xfs_trans_dqresv( * nblks. */ if (hardlimit > 0ULL && - (hardlimit <= nblks + *resbcountp)) { - error = xfs_quota_error(flags); + hardlimit <= nblks + *resbcountp) goto error_return; - } - if (softlimit > 0ULL && - (softlimit <= nblks + *resbcountp)) { - if ((timer != 0 && get_seconds() > timer) || - (warns != 0 && warns >= warnlimit)) { - error = xfs_quota_error(flags); - goto error_return; - } - } + softlimit <= nblks + *resbcountp && + ((timer != 0 && get_seconds() > timer) || + (warns != 0 && warns >= warnlimit))) + goto error_return; } if (ninos > 0) { count = be64_to_cpu(dqp->q_core.d_icount); @@ -692,16 +676,13 @@ xfs_trans_dqresv( softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); if (!softlimit) softlimit = q->qi_isoftlimit; - if (hardlimit > 0ULL && count >= hardlimit) { - error = xfs_quota_error(flags); + + if (hardlimit > 0ULL && count >= hardlimit) + goto error_return; + if (softlimit > 0ULL && count >= softlimit && + ((timer != 0 && get_seconds() > timer) || + (warns != 0 && warns >= warnlimit))) goto error_return; - } else if (softlimit > 0ULL && count >= softlimit) { - if ((timer != 0 && get_seconds() > timer) || - (warns != 0 && warns >= warnlimit)) { - error = xfs_quota_error(flags); - goto error_return; - } - } } } @@ -736,9 +717,14 @@ xfs_trans_dqresv( ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount)); ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); + xfs_dqunlock(dqp); + return 0; + error_return: xfs_dqunlock(dqp); - return error; + if (flags & XFS_QMOPT_ENOSPC) + return ENOSPC; + return EDQUOT; } -- cgit v1.2.3 From a210c1aa7f6c90b729cc3a72d03e789b13cb6c47 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Sun, 17 Jan 2010 22:36:19 +0000 Subject: xfs: implement quota warnings via netlink Wire up quota_send_warning to send quota warnings over netlink. This is used by various desktops to show user quota warnings. Tested by running the quota_nld daemon while running the xfstest quota tests and observing the warnings. I'll see how I can get a more formal testcase for it written. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/quota/xfs_trans_dquot.c | 49 +++++++++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index b9db6f781cd6..c3ab75cb1d9a 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -589,6 +589,20 @@ xfs_trans_unreserve_and_mod_dquots( } } +STATIC void +xfs_quota_warn( + struct xfs_mount *mp, + struct xfs_dquot *dqp, + int type) +{ + /* no warnings for project quotas - we just return ENOSPC later */ + if (dqp->dq_flags & XFS_DQ_PROJ) + return; + quota_send_warning((dqp->dq_flags & XFS_DQ_USER) ? USRQUOTA : GRPQUOTA, + be32_to_cpu(dqp->q_core.d_id), mp->m_super->s_dev, + type); +} + /* * This reserves disk blocks and inodes against a dquot. * Flags indicate if the dquot is to be locked here and also @@ -657,13 +671,21 @@ xfs_trans_dqresv( * nblks. */ if (hardlimit > 0ULL && - hardlimit <= nblks + *resbcountp) + hardlimit <= nblks + *resbcountp) { + xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN); goto error_return; + } if (softlimit > 0ULL && - softlimit <= nblks + *resbcountp && - ((timer != 0 && get_seconds() > timer) || - (warns != 0 && warns >= warnlimit))) - goto error_return; + softlimit <= nblks + *resbcountp) { + if ((timer != 0 && get_seconds() > timer) || + (warns != 0 && warns >= warnlimit)) { + xfs_quota_warn(mp, dqp, + QUOTA_NL_BSOFTLONGWARN); + goto error_return; + } + + xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN); + } } if (ninos > 0) { count = be64_to_cpu(dqp->q_core.d_icount); @@ -677,12 +699,19 @@ xfs_trans_dqresv( if (!softlimit) softlimit = q->qi_isoftlimit; - if (hardlimit > 0ULL && count >= hardlimit) - goto error_return; - if (softlimit > 0ULL && count >= softlimit && - ((timer != 0 && get_seconds() > timer) || - (warns != 0 && warns >= warnlimit))) + if (hardlimit > 0ULL && count >= hardlimit) { + xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN); goto error_return; + } + if (softlimit > 0ULL && count >= softlimit) { + if ((timer != 0 && get_seconds() > timer) || + (warns != 0 && warns >= warnlimit)) { + xfs_quota_warn(mp, dqp, + QUOTA_NL_ISOFTLONGWARN); + goto error_return; + } + xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN); + } } } -- cgit v1.2.3 From 0cadda1c5f194f98a05d252ff4385d86d2ed0862 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Tue, 19 Jan 2010 09:56:44 +0000 Subject: xfs: remove duplicate buffer flags Currently we define aliases for the buffer flags in various namespaces, which only adds confusion. Remove all but the XBF_ flags to clean this up a bit. Note that we still abuse XFS_B_ASYNC/XBF_ASYNC for some non-buffer uses, but I'll clean that up later. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_buf.c | 2 +- fs/xfs/linux-2.6/xfs_buf.h | 22 ++++------------------ fs/xfs/linux-2.6/xfs_fs_subr.c | 2 +- fs/xfs/linux-2.6/xfs_sync.c | 4 ++-- fs/xfs/quota/xfs_dquot.c | 3 +-- fs/xfs/quota/xfs_dquot_item.c | 3 +-- fs/xfs/xfs_alloc.c | 2 +- fs/xfs/xfs_attr.c | 12 +++++------- fs/xfs/xfs_attr_leaf.c | 2 +- fs/xfs/xfs_btree.c | 4 ++-- fs/xfs/xfs_ialloc.c | 2 +- fs/xfs/xfs_inode.c | 20 ++++++++++---------- fs/xfs/xfs_inode_item.c | 2 +- fs/xfs/xfs_log_recover.c | 8 ++++---- fs/xfs/xfs_mount.c | 4 ++-- fs/xfs/xfs_trans_buf.c | 27 ++++++++++++++------------- fs/xfs/xfs_vnodeops.c | 4 ++-- 17 files changed, 53 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index efd745bb8887..730eff1e71a3 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1169,7 +1169,7 @@ xfs_bioerror_relse( XFS_BUF_STALE(bp); XFS_BUF_CLR_IODONE_FUNC(bp); XFS_BUF_CLR_BDSTRAT_FUNC(bp); - if (!(fl & XFS_B_ASYNC)) { + if (!(fl & XBF_ASYNC)) { /* * Mark b_error and B_ERROR _both_. * Lot's of chunkcache code assumes that. diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 4f2ad66edb76..ea8c198f0c39 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -275,33 +275,19 @@ extern void xfs_buf_terminate(void); ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) -#define XFS_B_ASYNC XBF_ASYNC -#define XFS_B_DELWRI XBF_DELWRI -#define XFS_B_READ XBF_READ -#define XFS_B_WRITE XBF_WRITE -#define XFS_B_STALE XBF_STALE - -#define XFS_BUF_TRYLOCK XBF_TRYLOCK -#define XFS_INCORE_TRYLOCK XBF_TRYLOCK -#define XFS_BUF_LOCK XBF_LOCK -#define XFS_BUF_MAPPED XBF_MAPPED - -#define BUF_BUSY XBF_DONT_BLOCK - #define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) #define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) -#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE) -#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE) -#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XFS_B_STALE) +#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) +#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) +#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) #define XFS_BUF_SUPER_STALE(bp) do { \ XFS_BUF_STALE(bp); \ xfs_buf_delwri_dequeue(bp); \ XFS_BUF_DONE(bp); \ } while (0) -#define XFS_BUF_MANAGE XBF_FS_MANAGED #define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED) #define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) @@ -390,7 +376,7 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) #define xfs_biomove(bp, off, len, data, rw) \ xfs_buf_iomove((bp), (off), (len), (data), \ - ((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ) + ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ) #define xfs_biozero(bp, off, len) \ xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index 7501b85fd860..b6918d76bc7b 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -79,7 +79,7 @@ xfs_flush_pages( xfs_iflags_clear(ip, XFS_ITRUNCATED); ret = -filemap_fdatawrite(mapping); } - if (flags & XFS_B_ASYNC) + if (flags & XBF_ASYNC) return ret; ret2 = xfs_wait_on_pages(ip, first, last); if (!ret) diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index b58f8412dfe2..58c24be72c65 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -234,7 +234,7 @@ xfs_sync_inode_data( } error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? - 0 : XFS_B_ASYNC, FI_NONE); + 0 : XBF_ASYNC, FI_NONE); xfs_iunlock(ip, XFS_IOLOCK_SHARED); out_wait: @@ -370,7 +370,7 @@ xfs_sync_fsdata( if (flags & SYNC_TRYLOCK) { ASSERT(!(flags & SYNC_WAIT)); - bp = xfs_getsb(mp, XFS_BUF_TRYLOCK); + bp = xfs_getsb(mp, XBF_TRYLOCK); if (!bp) goto out; diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index a447493690eb..5756392ffdee 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -1527,8 +1527,7 @@ xfs_qm_dqflock_pushbuf_wait( * the flush lock when the I/O completes. */ bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno, - XFS_QI_DQCHUNKLEN(dqp->q_mount), - XFS_INCORE_TRYLOCK); + XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK); if (bp != NULL) { if (XFS_BUF_ISDELAYWRITE(bp)) { int error; diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index d0d4a9a0bbd7..37929d19ef44 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -237,8 +237,7 @@ xfs_qm_dquot_logitem_pushbuf( } mp = dqp->q_mount; bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, - XFS_QI_DQCHUNKLEN(mp), - XFS_INCORE_TRYLOCK); + XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK); if (bp != NULL) { if (XFS_BUF_ISDELAYWRITE(bp)) { dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 8aa181d6dd7d..a27aeb7d9e74 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2180,7 +2180,7 @@ xfs_alloc_read_agf( ASSERT(agno != NULLAGNUMBER); error = xfs_read_agf(mp, tp, agno, - (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0, + (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, bpp); if (error) return error; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index f7b426a1b6ee..b9c196a53c42 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2015,15 +2015,14 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, - blkcnt, - XFS_BUF_LOCK | XBF_DONT_BLOCK, + blkcnt, XBF_LOCK | XBF_DONT_BLOCK, &bp); if (error) return(error); tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : XFS_BUF_SIZE(bp); - xfs_biomove(bp, 0, tmp, dst, XFS_B_READ); + xfs_biomove(bp, 0, tmp, dst, XBF_READ); xfs_buf_relse(bp); dst += tmp; valuelen -= tmp; @@ -2149,13 +2148,13 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, - XFS_BUF_LOCK | XBF_DONT_BLOCK); + XBF_LOCK | XBF_DONT_BLOCK); ASSERT(bp); ASSERT(!XFS_BUF_GETERROR(bp)); tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : XFS_BUF_SIZE(bp); - xfs_biomove(bp, 0, tmp, src, XFS_B_WRITE); + xfs_biomove(bp, 0, tmp, src, XBF_WRITE); if (tmp < XFS_BUF_SIZE(bp)) xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ @@ -2216,8 +2215,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) /* * If the "remote" value is in the cache, remove it. */ - bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, - XFS_INCORE_TRYLOCK); + bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); if (bp) { XFS_BUF_STALE(bp); XFS_BUF_UNDELAYWRITE(bp); diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 52519a201849..a90ce74fc256 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -2950,7 +2950,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, map.br_blockcount); bp = xfs_trans_get_buf(*trans, dp->i_mount->m_ddev_targp, - dblkno, dblkcnt, XFS_BUF_LOCK); + dblkno, dblkcnt, XBF_LOCK); xfs_trans_binval(*trans, bp); /* * Roll to next transaction. diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 36a0992dd669..96be4b0f2496 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -977,7 +977,7 @@ xfs_btree_get_buf_block( xfs_daddr_t d; /* need to sort out how callers deal with failures first */ - ASSERT(!(flags & XFS_BUF_TRYLOCK)); + ASSERT(!(flags & XBF_TRYLOCK)); d = xfs_btree_ptr_to_daddr(cur, ptr); *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, @@ -1008,7 +1008,7 @@ xfs_btree_read_buf_block( int error; /* need to sort out how callers deal with failures first */ - ASSERT(!(flags & XFS_BUF_TRYLOCK)); + ASSERT(!(flags & XBF_TRYLOCK)); d = xfs_btree_ptr_to_daddr(cur, ptr); error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 52c9d006c0e6..9d884c127bb9 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -205,7 +205,7 @@ xfs_ialloc_inode_init( d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize * blks_per_cluster, - XFS_BUF_LOCK); + XBF_LOCK); ASSERT(fbuf); ASSERT(!XFS_BUF_GETERROR(fbuf)); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0317b000ab44..bbb3bee8e936 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -151,7 +151,7 @@ xfs_imap_to_bp( "an error %d on %s. Returning error.", error, mp->m_fsname); } else { - ASSERT(buf_flags & XFS_BUF_TRYLOCK); + ASSERT(buf_flags & XBF_TRYLOCK); } return error; } @@ -239,7 +239,7 @@ xfs_inotobp( if (error) return error; - error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags); + error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags); if (error) return error; @@ -285,7 +285,7 @@ xfs_itobp( return error; if (!bp) { - ASSERT(buf_flags & XFS_BUF_TRYLOCK); + ASSERT(buf_flags & XBF_TRYLOCK); ASSERT(tp == NULL); *bpp = NULL; return EAGAIN; @@ -807,7 +807,7 @@ xfs_iread( * Get pointers to the on-disk inode and the buffer containing it. */ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, - XFS_BUF_LOCK, iget_flags); + XBF_LOCK, iget_flags); if (error) return error; dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -1751,7 +1751,7 @@ xfs_iunlink( * Here we put the head pointer into our next pointer, * and then we fall through to point the head at us. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); if (error) return error; @@ -1833,7 +1833,7 @@ xfs_iunlink_remove( * of dealing with the buffer when there is no need to * change it. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); if (error) { cmn_err(CE_WARN, "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", @@ -1895,7 +1895,7 @@ xfs_iunlink_remove( * Now last_ibp points to the buffer previous to us on * the unlinked list. Pull us from the list. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); if (error) { cmn_err(CE_WARN, "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", @@ -2040,7 +2040,7 @@ xfs_ifree_cluster( bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, mp->m_bsize * blks_per_cluster, - XFS_BUF_LOCK); + XBF_LOCK); pre_flushed = 0; lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); @@ -2151,7 +2151,7 @@ xfs_ifree( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XFS_BUF_LOCK); + error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK); if (error) return error; @@ -2952,7 +2952,7 @@ xfs_iflush( * Get the buffer containing the on-disk inode. */ error = xfs_itobp(mp, NULL, ip, &dip, &bp, - noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK); + noblock ? XBF_TRYLOCK : XBF_LOCK); if (error || !bp) { xfs_ifunlock(ip); return error; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index f38855d21ea5..6194fb5d3777 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -785,7 +785,7 @@ xfs_inode_item_pushbuf( mp = ip->i_mount; bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno, - iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK); + iip->ili_format.ilf_len, XBF_TRYLOCK); if (bp != NULL) { if (XFS_BUF_ISDELAYWRITE(bp)) { diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 65f1f137d789..97148f0c4bdd 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2184,9 +2184,9 @@ xlog_recover_do_buffer_trans( } mp = log->l_mp; - buf_flags = XFS_BUF_LOCK; + buf_flags = XBF_LOCK; if (!(flags & XFS_BLI_INODE_BUF)) - buf_flags |= XFS_BUF_MAPPED; + buf_flags |= XBF_MAPPED; bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); if (XFS_BUF_ISERROR(bp)) { @@ -2288,7 +2288,7 @@ xlog_recover_do_inode_trans( } bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, - XFS_BUF_LOCK); + XBF_LOCK); if (XFS_BUF_ISERROR(bp)) { xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, bp, in_f->ilf_blkno); @@ -3146,7 +3146,7 @@ xlog_recover_process_one_iunlink( /* * Get the on disk inode to find the next inode in the bucket. */ - error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK); + error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK); if (error) goto fail_iput; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d95bd1809f3c..bb0154047e85 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -665,7 +665,7 @@ xfs_readsb(xfs_mount_t *mp, int flags) * access to the superblock. */ sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); - extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED; + extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED; bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), extra_flags); @@ -1969,7 +1969,7 @@ xfs_getsb( ASSERT(mp->m_sb_bp != NULL); bp = mp->m_sb_bp; - if (flags & XFS_BUF_TRYLOCK) { + if (flags & XBF_TRYLOCK) { if (!XFS_BUF_CPSEMA(bp)) { return NULL; } diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 49130628d5ef..5ffd544434eb 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -75,13 +75,14 @@ xfs_trans_get_buf(xfs_trans_t *tp, xfs_buf_log_item_t *bip; if (flags == 0) - flags = XFS_BUF_LOCK | XFS_BUF_MAPPED; + flags = XBF_LOCK | XBF_MAPPED; /* * Default to a normal get_buf() call if the tp is NULL. */ if (tp == NULL) - return xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY); + return xfs_buf_get(target_dev, blkno, len, + flags | XBF_DONT_BLOCK); /* * If we find the buffer in the cache with this transaction @@ -117,14 +118,14 @@ xfs_trans_get_buf(xfs_trans_t *tp, } /* - * We always specify the BUF_BUSY flag within a transaction so - * that get_buf does not try to push out a delayed write buffer + * We always specify the XBF_DONT_BLOCK flag within a transaction + * so that get_buf does not try to push out a delayed write buffer * which might cause another transaction to take place (if the * buffer was delayed alloc). Such recursive transactions can * easily deadlock with our current transaction as well as cause * us to run out of stack space. */ - bp = xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY); + bp = xfs_buf_get(target_dev, blkno, len, flags | XBF_DONT_BLOCK); if (bp == NULL) { return NULL; } @@ -290,15 +291,15 @@ xfs_trans_read_buf( int error; if (flags == 0) - flags = XFS_BUF_LOCK | XFS_BUF_MAPPED; + flags = XBF_LOCK | XBF_MAPPED; /* * Default to a normal get_buf() call if the tp is NULL. */ if (tp == NULL) { - bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY); + bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); if (!bp) - return (flags & XFS_BUF_TRYLOCK) ? + return (flags & XBF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); if (XFS_BUF_GETERROR(bp) != 0) { @@ -385,14 +386,14 @@ xfs_trans_read_buf( } /* - * We always specify the BUF_BUSY flag within a transaction so - * that get_buf does not try to push out a delayed write buffer + * We always specify the XBF_DONT_BLOCK flag within a transaction + * so that get_buf does not try to push out a delayed write buffer * which might cause another transaction to take place (if the * buffer was delayed alloc). Such recursive transactions can * easily deadlock with our current transaction as well as cause * us to run out of stack space. */ - bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY); + bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); if (bp == NULL) { *bpp = NULL; return 0; @@ -472,8 +473,8 @@ shutdown_abort: if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp); #endif - ASSERT((XFS_BUF_BFLAGS(bp) & (XFS_B_STALE|XFS_B_DELWRI)) != - (XFS_B_STALE|XFS_B_DELWRI)); + ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) != + (XBF_STALE|XBF_DELWRI)); trace_xfs_trans_read_buf_shut(bp, _RET_IP_); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 9f7c001ef469..4da96cdffb76 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -256,7 +256,7 @@ xfs_setattr( iattr->ia_size > ip->i_d.di_size) { code = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, - XFS_B_ASYNC, FI_NONE); + XBF_ASYNC, FI_NONE); } /* wait for all I/O to complete */ @@ -1096,7 +1096,7 @@ xfs_release( */ truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) - xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE); + xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); } if (ip->i_d.di_nlink != 0) { -- cgit v1.2.3 From 4139b3b337cffd106744386c842b89dc86e31d4b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Tue, 19 Jan 2010 09:56:45 +0000 Subject: xfs: kill XLOG_VEC_SET_TYPE This macro only obsfucates the log item type assignments, so kill it. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/quota/xfs_dquot_item.c | 6 +++--- fs/xfs/xfs_buf_item.c | 8 ++++---- fs/xfs/xfs_extfree_item.c | 4 ++-- fs/xfs/xfs_inode_item.c | 18 +++++++++--------- fs/xfs/xfs_log.c | 4 ++-- fs/xfs/xfs_log.h | 4 +--- fs/xfs/xfs_trans.c | 2 +- 7 files changed, 22 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 37929d19ef44..116580d52fae 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -74,11 +74,11 @@ xfs_qm_dquot_logitem_format( logvec->i_addr = (xfs_caddr_t)&logitem->qli_format; logvec->i_len = sizeof(xfs_dq_logformat_t); - XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_QFORMAT); + logvec->i_type = XLOG_REG_TYPE_QFORMAT; logvec++; logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core; logvec->i_len = sizeof(xfs_disk_dquot_t); - XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_DQUOT); + logvec->i_type = XLOG_REG_TYPE_DQUOT; ASSERT(2 == logitem->qli_item.li_desc->lid_size); logitem->qli_format.qlf_size = 2; @@ -466,7 +466,7 @@ xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t *qf, log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format); log_vector->i_len = sizeof(xfs_qoff_logitem_t); - XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_QUOTAOFF); + log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF; qf->qql_format.qf_size = 1; } diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a30f7e9eb2b9..e0a11583ce5a 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -250,7 +250,7 @@ xfs_buf_item_format( ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); vecp->i_addr = (xfs_caddr_t)&bip->bli_format; vecp->i_len = base_size; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT); + vecp->i_type = XLOG_REG_TYPE_BFORMAT; vecp++; nvecs = 1; @@ -297,14 +297,14 @@ xfs_buf_item_format( buffer_offset = first_bit * XFS_BLI_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLI_CHUNK; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); + vecp->i_type = XLOG_REG_TYPE_BCHUNK; nvecs++; break; } else if (next_bit != last_bit + 1) { buffer_offset = first_bit * XFS_BLI_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLI_CHUNK; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); + vecp->i_type = XLOG_REG_TYPE_BCHUNK; nvecs++; vecp++; first_bit = next_bit; @@ -316,7 +316,7 @@ xfs_buf_item_format( buffer_offset = first_bit * XFS_BLI_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLI_CHUNK; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); + vecp->i_type = XLOG_REG_TYPE_BCHUNK; /* You would think we need to bump the nvecs here too, but we do not * this number is used by recovery, and it gets confused by the boundary * split here diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 05a4bdd4be39..6f35ed1b39b9 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -82,7 +82,7 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip, log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format); log_vector->i_len = size; - XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFI_FORMAT); + log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT; ASSERT(size >= sizeof(xfs_efi_log_format_t)); } @@ -406,7 +406,7 @@ xfs_efd_item_format(xfs_efd_log_item_t *efdp, log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format); log_vector->i_len = size; - XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFD_FORMAT); + log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT; ASSERT(size >= sizeof(xfs_efd_log_format_t)); } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6194fb5d3777..da4cac67bdae 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -228,7 +228,7 @@ xfs_inode_item_format( vecp->i_addr = (xfs_caddr_t)&iip->ili_format; vecp->i_len = sizeof(xfs_inode_log_format_t); - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IFORMAT); + vecp->i_type = XLOG_REG_TYPE_IFORMAT; vecp++; nvecs = 1; @@ -279,7 +279,7 @@ xfs_inode_item_format( vecp->i_addr = (xfs_caddr_t)&ip->i_d; vecp->i_len = sizeof(struct xfs_icdinode); - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE); + vecp->i_type = XLOG_REG_TYPE_ICORE; vecp++; nvecs++; iip->ili_format.ilf_fields |= XFS_ILOG_CORE; @@ -336,7 +336,7 @@ xfs_inode_item_format( vecp->i_addr = (char *)(ip->i_df.if_u1.if_extents); vecp->i_len = ip->i_df.if_bytes; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT); + vecp->i_type = XLOG_REG_TYPE_IEXT; } else #endif { @@ -355,7 +355,7 @@ xfs_inode_item_format( vecp->i_addr = (xfs_caddr_t)ext_buffer; vecp->i_len = xfs_iextents_copy(ip, ext_buffer, XFS_DATA_FORK); - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT); + vecp->i_type = XLOG_REG_TYPE_IEXT; } ASSERT(vecp->i_len <= ip->i_df.if_bytes); iip->ili_format.ilf_dsize = vecp->i_len; @@ -373,7 +373,7 @@ xfs_inode_item_format( ASSERT(ip->i_df.if_broot != NULL); vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot; vecp->i_len = ip->i_df.if_broot_bytes; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IBROOT); + vecp->i_type = XLOG_REG_TYPE_IBROOT; vecp++; nvecs++; iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; @@ -399,7 +399,7 @@ xfs_inode_item_format( ASSERT((ip->i_df.if_real_bytes == 0) || (ip->i_df.if_real_bytes == data_bytes)); vecp->i_len = (int)data_bytes; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ILOCAL); + vecp->i_type = XLOG_REG_TYPE_ILOCAL; vecp++; nvecs++; iip->ili_format.ilf_dsize = (unsigned)data_bytes; @@ -477,7 +477,7 @@ xfs_inode_item_format( vecp->i_len = xfs_iextents_copy(ip, ext_buffer, XFS_ATTR_FORK); #endif - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_EXT); + vecp->i_type = XLOG_REG_TYPE_IATTR_EXT; iip->ili_format.ilf_asize = vecp->i_len; vecp++; nvecs++; @@ -492,7 +492,7 @@ xfs_inode_item_format( ASSERT(ip->i_afp->if_broot != NULL); vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot; vecp->i_len = ip->i_afp->if_broot_bytes; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_BROOT); + vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; vecp++; nvecs++; iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; @@ -516,7 +516,7 @@ xfs_inode_item_format( ASSERT((ip->i_afp->if_real_bytes == 0) || (ip->i_afp->if_real_bytes == data_bytes)); vecp->i_len = (int)data_bytes; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_LOCAL); + vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL; vecp++; nvecs++; iip->ili_format.ilf_asize = (unsigned)data_bytes; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0d17516fbb13..20118ddadef6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -617,7 +617,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) if (! (XLOG_FORCED_SHUTDOWN(log))) { reg[0].i_addr = (void*)&magic; reg[0].i_len = sizeof(magic); - XLOG_VEC_SET_TYPE(®[0], XLOG_REG_TYPE_UNMOUNT); + reg[0].i_type = XLOG_REG_TYPE_UNMOUNT; error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); @@ -1236,7 +1236,7 @@ xlog_commit_record(xfs_mount_t *mp, reg[0].i_addr = NULL; reg[0].i_len = 0; - XLOG_VEC_SET_TYPE(®[0], XLOG_REG_TYPE_COMMIT); + reg[0].i_type = XLOG_REG_TYPE_COMMIT; ASSERT_ALWAYS(iclog); if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp, diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index d0c9baa50b1a..811ccf4d8b3e 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -110,10 +110,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) #define XLOG_REG_TYPE_TRANSHDR 19 #define XLOG_REG_TYPE_MAX 19 -#define XLOG_VEC_SET_TYPE(vecp, t) ((vecp)->i_type = (t)) - typedef struct xfs_log_iovec { - xfs_caddr_t i_addr; /* beginning address of region */ + xfs_caddr_t i_addr; /* beginning address of region */ int i_len; /* length in bytes of region */ uint i_type; /* type of region */ } xfs_log_iovec_t; diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 237badcbac3b..7dbe3c3051db 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1121,7 +1121,7 @@ xfs_trans_fill_vecs( tp->t_header.th_num_items = nitems; log_vector->i_addr = (xfs_caddr_t)&tp->t_header; log_vector->i_len = sizeof(xfs_trans_header_t); - XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_TRANSHDR); + log_vector->i_type = XLOG_REG_TYPE_TRANSHDR; } -- cgit v1.2.3 From a14a348bff2f99471a28e5928eb6801224c053d8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Tue, 19 Jan 2010 09:56:46 +0000 Subject: xfs: cleanup up xfs_log_force calling conventions Remove the XFS_LOG_FORCE argument which was always set, and the XFS_LOG_URGE define, which was never used. Split xfs_log_force into a two helpers - xfs_log_force which forces the whole log, and xfs_log_force_lsn which forces up to the specified LSN. The underlying implementations already were entirely separate, as were the users. Also re-indent the new _xfs_log_force/_xfs_log_force which previously had a weird coding style. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_sync.c | 17 +-- fs/xfs/quota/xfs_dquot.c | 10 +- fs/xfs/quota/xfs_dquot_item.c | 9 +- fs/xfs/quota/xfs_qm_syscalls.c | 4 +- fs/xfs/xfs_alloc.c | 2 +- fs/xfs/xfs_inode.c | 9 +- fs/xfs/xfs_inode_item.c | 7 +- fs/xfs/xfs_log.c | 312 ++++++++++++++++++++--------------------- fs/xfs/xfs_log.h | 15 +- fs/xfs/xfs_log_recover.c | 3 +- fs/xfs/xfs_mount.c | 4 +- fs/xfs/xfs_trans.c | 5 +- fs/xfs/xfs_trans_ail.c | 2 +- fs/xfs/xfs_vnodeops.c | 5 +- 14 files changed, 193 insertions(+), 211 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 58c24be72c65..c9b863eacab7 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -296,10 +296,7 @@ xfs_sync_data( if (error) return XFS_ERROR(error); - xfs_log_force(mp, 0, - (flags & SYNC_WAIT) ? - XFS_LOG_FORCE | XFS_LOG_SYNC : - XFS_LOG_FORCE); + xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); return 0; } @@ -325,10 +322,6 @@ xfs_commit_dummy_trans( struct xfs_inode *ip = mp->m_rootip; struct xfs_trans *tp; int error; - int log_flags = XFS_LOG_FORCE; - - if (flags & SYNC_WAIT) - log_flags |= XFS_LOG_SYNC; /* * Put a dummy transaction in the log to tell recovery @@ -350,7 +343,7 @@ xfs_commit_dummy_trans( xfs_iunlock(ip, XFS_ILOCK_EXCL); /* the log force ensures this transaction is pushed to disk */ - xfs_log_force(mp, 0, log_flags); + xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); return error; } @@ -390,7 +383,7 @@ xfs_sync_fsdata( * become pinned in between there and here. */ if (XFS_BUF_ISPINNED(bp)) - xfs_log_force(mp, 0, XFS_LOG_FORCE); + xfs_log_force(mp, 0); } @@ -575,7 +568,7 @@ xfs_flush_inodes( igrab(inode); xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); wait_for_completion(&completion); - xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); + xfs_log_force(ip->i_mount, XFS_LOG_SYNC); } /* @@ -591,7 +584,7 @@ xfs_sync_worker( int error; if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); + xfs_log_force(mp, 0); xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); /* dgc: errors ignored here */ error = xfs_qm_sync(mp, SYNC_TRYLOCK); diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 5756392ffdee..f9baeedbfdfe 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -1248,7 +1248,7 @@ xfs_qm_dqflush( */ if (XFS_BUF_ISPINNED(bp)) { trace_xfs_dqflush_force(dqp); - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); + xfs_log_force(mp, 0); } if (flags & XFS_QMOPT_DELWRI) { @@ -1531,11 +1531,9 @@ xfs_qm_dqflock_pushbuf_wait( if (bp != NULL) { if (XFS_BUF_ISDELAYWRITE(bp)) { int error; - if (XFS_BUF_ISPINNED(bp)) { - xfs_log_force(dqp->q_mount, - (xfs_lsn_t)0, - XFS_LOG_FORCE); - } + + if (XFS_BUF_ISPINNED(bp)) + xfs_log_force(dqp->q_mount, 0); error = xfs_bawrite(dqp->q_mount, bp); if (error) xfs_fs_cmn_err(CE_WARN, dqp->q_mount, diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 116580d52fae..1b564376d50c 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -190,7 +190,7 @@ xfs_qm_dqunpin_wait( /* * Give the log a push so we don't wait here too long. */ - xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE); + xfs_log_force(dqp->q_mount, 0); wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); } @@ -245,10 +245,9 @@ xfs_qm_dquot_logitem_pushbuf( qip->qli_pushbuf_flag = 0; xfs_dqunlock(dqp); - if (XFS_BUF_ISPINNED(bp)) { - xfs_log_force(mp, (xfs_lsn_t)0, - XFS_LOG_FORCE); - } + if (XFS_BUF_ISPINNED(bp)) + xfs_log_force(mp, 0); + if (dopush) { int error; #ifdef XFSRACEDEBUG diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 873e07e29074..5d0ee8d492db 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -1192,9 +1192,9 @@ xfs_qm_internalqcheck( if (! XFS_IS_QUOTA_ON(mp)) return XFS_ERROR(ESRCH); - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); + xfs_log_force(mp, XFS_LOG_SYNC); XFS_bflush(mp->m_ddev_targp); - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); + xfs_log_force(mp, XFS_LOG_SYNC); XFS_bflush(mp->m_ddev_targp); mutex_lock(&qcheck_lock); diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index a27aeb7d9e74..94cddbfb2560 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2601,5 +2601,5 @@ xfs_alloc_search_busy(xfs_trans_t *tp, * transaction that freed the block */ if (lsn) - xfs_log_force(tp->t_mountp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); + xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index bbb3bee8e936..d0d1b5a05183 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2484,8 +2484,11 @@ __xfs_iunpin_wait( return; /* Give the log a push to start the unpinning I/O */ - xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ? - iip->ili_last_lsn : 0, XFS_LOG_FORCE); + if (iip && iip->ili_last_lsn) + xfs_log_force_lsn(ip->i_mount, iip->ili_last_lsn, 0); + else + xfs_log_force(ip->i_mount, 0); + if (wait) wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); } @@ -2970,7 +2973,7 @@ xfs_iflush( * get stuck waiting in the write for too long. */ if (XFS_BUF_ISPINNED(bp)) - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); + xfs_log_force(mp, 0); /* * inode clustering: diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index da4cac67bdae..48ec1c0b23ce 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -804,10 +804,9 @@ xfs_inode_item_pushbuf( trace_xfs_inode_item_push(bp, _RET_IP_); - if (XFS_BUF_ISPINNED(bp)) { - xfs_log_force(mp, (xfs_lsn_t)0, - XFS_LOG_FORCE); - } + if (XFS_BUF_ISPINNED(bp)) + xfs_log_force(mp, 0); + if (dopush) { int error; error = xfs_bawrite(mp, bp); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 20118ddadef6..4f16be4b6ee5 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -79,11 +79,6 @@ STATIC int xlog_state_release_iclog(xlog_t *log, STATIC void xlog_state_switch_iclogs(xlog_t *log, xlog_in_core_t *iclog, int eventual_size); -STATIC int xlog_state_sync(xlog_t *log, - xfs_lsn_t lsn, - uint flags, - int *log_flushed); -STATIC int xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed); STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); /* local functions to manipulate grant head */ @@ -296,65 +291,6 @@ xfs_log_done(xfs_mount_t *mp, return lsn; } /* xfs_log_done */ - -/* - * Force the in-core log to disk. If flags == XFS_LOG_SYNC, - * the force is done synchronously. - * - * Asynchronous forces are implemented by setting the WANT_SYNC - * bit in the appropriate in-core log and then returning. - * - * Synchronous forces are implemented with a signal variable. All callers - * to force a given lsn to disk will wait on a the sv attached to the - * specific in-core log. When given in-core log finally completes its - * write to disk, that thread will wake up all threads waiting on the - * sv. - */ -int -_xfs_log_force( - xfs_mount_t *mp, - xfs_lsn_t lsn, - uint flags, - int *log_flushed) -{ - xlog_t *log = mp->m_log; - int dummy; - - if (!log_flushed) - log_flushed = &dummy; - - ASSERT(flags & XFS_LOG_FORCE); - - XFS_STATS_INC(xs_log_force); - - if (log->l_flags & XLOG_IO_ERROR) - return XFS_ERROR(EIO); - if (lsn == 0) - return xlog_state_sync_all(log, flags, log_flushed); - else - return xlog_state_sync(log, lsn, flags, log_flushed); -} /* _xfs_log_force */ - -/* - * Wrapper for _xfs_log_force(), to be used when caller doesn't care - * about errors or whether the log was flushed or not. This is the normal - * interface to use when trying to unpin items or move the log forward. - */ -void -xfs_log_force( - xfs_mount_t *mp, - xfs_lsn_t lsn, - uint flags) -{ - int error; - error = _xfs_log_force(mp, lsn, flags, NULL); - if (error) { - xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " - "error %d returned.", error); - } -} - - /* * Attaches a new iclog I/O completion callback routine during * transaction commit. If the log is in error state, a non-zero @@ -601,7 +537,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) if (mp->m_flags & XFS_MOUNT_RDONLY) return 0; - error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL); + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); #ifdef DEBUG @@ -2853,7 +2789,6 @@ xlog_state_switch_iclogs(xlog_t *log, log->l_iclog = iclog->ic_next; } /* xlog_state_switch_iclogs */ - /* * Write out all data in the in-core log as of this exact moment in time. * @@ -2881,11 +2816,17 @@ xlog_state_switch_iclogs(xlog_t *log, * b) when we return from flushing out this iclog, it is still * not in the active nor dirty state. */ -STATIC int -xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) +int +_xfs_log_force( + struct xfs_mount *mp, + uint flags, + int *log_flushed) { - xlog_in_core_t *iclog; - xfs_lsn_t lsn; + struct log *log = mp->m_log; + struct xlog_in_core *iclog; + xfs_lsn_t lsn; + + XFS_STATS_INC(xs_log_force); spin_lock(&log->l_icloglock); @@ -2931,7 +2872,9 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) if (xlog_state_release_iclog(log, iclog)) return XFS_ERROR(EIO); - *log_flushed = 1; + + if (log_flushed) + *log_flushed = 1; spin_lock(&log->l_icloglock); if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn && iclog->ic_state != XLOG_STATE_DIRTY) @@ -2975,19 +2918,37 @@ maybe_sleep: */ if (iclog->ic_state & XLOG_STATE_IOERROR) return XFS_ERROR(EIO); - *log_flushed = 1; - + if (log_flushed) + *log_flushed = 1; } else { no_sleep: spin_unlock(&log->l_icloglock); } return 0; -} /* xlog_state_sync_all */ +} +/* + * Wrapper for _xfs_log_force(), to be used when caller doesn't care + * about errors or whether the log was flushed or not. This is the normal + * interface to use when trying to unpin items or move the log forward. + */ +void +xfs_log_force( + xfs_mount_t *mp, + uint flags) +{ + int error; + + error = _xfs_log_force(mp, flags, NULL); + if (error) { + xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " + "error %d returned.", error); + } +} /* - * Used by code which implements synchronous log forces. + * Force the in-core log to disk for a specific LSN. * * Find in-core log with lsn. * If it is in the DIRTY state, just return. @@ -2995,109 +2956,142 @@ no_sleep: * state and go to sleep or return. * If it is in any other state, go to sleep or return. * - * If filesystem activity goes to zero, the iclog will get flushed only by - * bdflush(). + * Synchronous forces are implemented with a signal variable. All callers + * to force a given lsn to disk will wait on a the sv attached to the + * specific in-core log. When given in-core log finally completes its + * write to disk, that thread will wake up all threads waiting on the + * sv. */ -STATIC int -xlog_state_sync(xlog_t *log, - xfs_lsn_t lsn, - uint flags, - int *log_flushed) +int +_xfs_log_force_lsn( + struct xfs_mount *mp, + xfs_lsn_t lsn, + uint flags, + int *log_flushed) { - xlog_in_core_t *iclog; - int already_slept = 0; - -try_again: - spin_lock(&log->l_icloglock); - iclog = log->l_iclog; + struct log *log = mp->m_log; + struct xlog_in_core *iclog; + int already_slept = 0; - if (iclog->ic_state & XLOG_STATE_IOERROR) { - spin_unlock(&log->l_icloglock); - return XFS_ERROR(EIO); - } + ASSERT(lsn != 0); - do { - if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { - iclog = iclog->ic_next; - continue; - } + XFS_STATS_INC(xs_log_force); - if (iclog->ic_state == XLOG_STATE_DIRTY) { +try_again: + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); - return 0; + return XFS_ERROR(EIO); } - if (iclog->ic_state == XLOG_STATE_ACTIVE) { - /* - * We sleep here if we haven't already slept (e.g. - * this is the first time we've looked at the correct - * iclog buf) and the buffer before us is going to - * be sync'ed. The reason for this is that if we - * are doing sync transactions here, by waiting for - * the previous I/O to complete, we can allow a few - * more transactions into this iclog before we close - * it down. - * - * Otherwise, we mark the buffer WANT_SYNC, and bump - * up the refcnt so we can release the log (which drops - * the ref count). The state switch keeps new transaction - * commits from using this buffer. When the current commits - * finish writing into the buffer, the refcount will drop to - * zero and the buffer will go out then. - */ - if (!already_slept && - (iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC | - XLOG_STATE_SYNCING))) { - ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); - XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_prev->ic_write_wait, PSWP, - &log->l_icloglock, s); - *log_flushed = 1; - already_slept = 1; - goto try_again; - } else { + do { + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { + iclog = iclog->ic_next; + continue; + } + + if (iclog->ic_state == XLOG_STATE_DIRTY) { + spin_unlock(&log->l_icloglock); + return 0; + } + + if (iclog->ic_state == XLOG_STATE_ACTIVE) { + /* + * We sleep here if we haven't already slept (e.g. + * this is the first time we've looked at the correct + * iclog buf) and the buffer before us is going to + * be sync'ed. The reason for this is that if we + * are doing sync transactions here, by waiting for + * the previous I/O to complete, we can allow a few + * more transactions into this iclog before we close + * it down. + * + * Otherwise, we mark the buffer WANT_SYNC, and bump + * up the refcnt so we can release the log (which + * drops the ref count). The state switch keeps new + * transaction commits from using this buffer. When + * the current commits finish writing into the buffer, + * the refcount will drop to zero and the buffer will + * go out then. + */ + if (!already_slept && + (iclog->ic_prev->ic_state & + (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { + ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); + + XFS_STATS_INC(xs_log_force_sleep); + + sv_wait(&iclog->ic_prev->ic_write_wait, + PSWP, &log->l_icloglock, s); + if (log_flushed) + *log_flushed = 1; + already_slept = 1; + goto try_again; + } atomic_inc(&iclog->ic_refcnt); xlog_state_switch_iclogs(log, iclog, 0); spin_unlock(&log->l_icloglock); if (xlog_state_release_iclog(log, iclog)) return XFS_ERROR(EIO); - *log_flushed = 1; + if (log_flushed) + *log_flushed = 1; spin_lock(&log->l_icloglock); } - } - if ((flags & XFS_LOG_SYNC) && /* sleep */ - !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { + if ((flags & XFS_LOG_SYNC) && /* sleep */ + !(iclog->ic_state & + (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { + /* + * Don't wait on completion if we know that we've + * gotten a log write error. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return XFS_ERROR(EIO); + } + XFS_STATS_INC(xs_log_force_sleep); + sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); + /* + * No need to grab the log lock here since we're + * only deciding whether or not to return EIO + * and the memory read should be atomic. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) + return XFS_ERROR(EIO); - /* - * Don't wait on completion if we know that we've - * gotten a log write error. - */ - if (iclog->ic_state & XLOG_STATE_IOERROR) { + if (log_flushed) + *log_flushed = 1; + } else { /* just return */ spin_unlock(&log->l_icloglock); - return XFS_ERROR(EIO); } - XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); - /* - * No need to grab the log lock here since we're - * only deciding whether or not to return EIO - * and the memory read should be atomic. - */ - if (iclog->ic_state & XLOG_STATE_IOERROR) - return XFS_ERROR(EIO); - *log_flushed = 1; - } else { /* just return */ - spin_unlock(&log->l_icloglock); - } - return 0; - } while (iclog != log->l_iclog); + return 0; + } while (iclog != log->l_iclog); - spin_unlock(&log->l_icloglock); - return 0; -} /* xlog_state_sync */ + spin_unlock(&log->l_icloglock); + return 0; +} + +/* + * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care + * about errors or whether the log was flushed or not. This is the normal + * interface to use when trying to unpin items or move the log forward. + */ +void +xfs_log_force_lsn( + xfs_mount_t *mp, + xfs_lsn_t lsn, + uint flags) +{ + int error; + error = _xfs_log_force_lsn(mp, lsn, flags, NULL); + if (error) { + xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " + "error %d returned.", error); + } +} /* * Called when we want to mark the current iclog as being ready to sync to @@ -3462,7 +3456,6 @@ xfs_log_force_umount( xlog_ticket_t *tic; xlog_t *log; int retval; - int dummy; log = mp->m_log; @@ -3536,13 +3529,14 @@ xfs_log_force_umount( } spin_unlock(&log->l_grant_lock); - if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { + if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { ASSERT(!logerror); /* * Force the incore logs to disk before shutting the * log down completely. */ - xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC, &dummy); + _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + spin_lock(&log->l_icloglock); retval = xlog_state_ioerror(log); spin_unlock(&log->l_icloglock); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 811ccf4d8b3e..7074be9d13e9 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -70,14 +70,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) * Flags to xfs_log_force() * * XFS_LOG_SYNC: Synchronous force in-core log to disk - * XFS_LOG_FORCE: Start in-core log write now. - * XFS_LOG_URGE: Start write within some window of time. - * - * Note: Either XFS_LOG_FORCE or XFS_LOG_URGE must be set. */ #define XFS_LOG_SYNC 0x1 -#define XFS_LOG_FORCE 0x2 -#define XFS_LOG_URGE 0x4 #endif /* __KERNEL__ */ @@ -138,12 +132,17 @@ xfs_lsn_t xfs_log_done(struct xfs_mount *mp, void **iclog, uint flags); int _xfs_log_force(struct xfs_mount *mp, - xfs_lsn_t lsn, uint flags, int *log_forced); void xfs_log_force(struct xfs_mount *mp, - xfs_lsn_t lsn, uint flags); +int _xfs_log_force_lsn(struct xfs_mount *mp, + xfs_lsn_t lsn, + uint flags, + int *log_forced); +void xfs_log_force_lsn(struct xfs_mount *mp, + xfs_lsn_t lsn, + uint flags); int xfs_log_mount(struct xfs_mount *mp, struct xfs_buftarg *log_target, xfs_daddr_t start_block, diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 97148f0c4bdd..22e6efdc17ea 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3913,8 +3913,7 @@ xlog_recover_finish( * case the unlink transactions would have problems * pushing the EFIs out of the way. */ - xfs_log_force(log->l_mp, (xfs_lsn_t)0, - (XFS_LOG_FORCE | XFS_LOG_SYNC)); + xfs_log_force(log->l_mp, XFS_LOG_SYNC); xlog_recover_process_iunlinks(log); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bb0154047e85..7f81ed72c875 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1455,7 +1455,7 @@ xfs_unmountfs( * push out the iclog we will never get that unlocked. hence we * need to force the log first. */ - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); + xfs_log_force(mp, XFS_LOG_SYNC); xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC); xfs_qm_unmount(mp); @@ -1465,7 +1465,7 @@ xfs_unmountfs( * that nothing is pinned. This is important because bflush() * will skip pinned buffers. */ - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); + xfs_log_force(mp, XFS_LOG_SYNC); xfs_binval(mp->m_ddev_targp); if (mp->m_rtdev_targp) { diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 7dbe3c3051db..be942d4e3324 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -981,9 +981,8 @@ shut_us_down: */ if (sync) { if (!error) { - error = _xfs_log_force(mp, commit_lsn, - XFS_LOG_FORCE | XFS_LOG_SYNC, - log_flushed); + error = _xfs_log_force_lsn(mp, commit_lsn, + XFS_LOG_SYNC, log_flushed); } XFS_STATS_INC(xs_trans_sync); } else { diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 063dfbdca94b..d7b1af8a832d 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -371,7 +371,7 @@ xfsaild_push( * move forward in the AIL. */ XFS_STATS_INC(xs_push_ail_flush); - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); + xfs_log_force(mp, 0); } if (!count) { diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 4da96cdffb76..fd108b738559 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -631,9 +631,8 @@ xfs_fsync( xfs_iunlock(ip, XFS_ILOCK_SHARED); if (xfs_ipincount(ip)) { - error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0, - XFS_LOG_FORCE | XFS_LOG_SYNC, - &log_flushed); + error = _xfs_log_force(ip->i_mount, XFS_LOG_SYNC, + &log_flushed); } else { /* * If the inode is not pinned and nothing has changed -- cgit v1.2.3 From bdfb04301fa5fdd95f219539a9a5b9663b1e5fc2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 20 Jan 2010 21:55:30 +0000 Subject: xfs: replace KM_LARGE with explicit vmalloc use We use the KM_LARGE flag to make kmem_alloc and friends use vmalloc if necessary. As we only need this for a few boot/mount time allocations just switch to explicit vmalloc calls there. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/kmem.c | 56 +++++++++++++++++----------------------------- fs/xfs/linux-2.6/kmem.h | 21 ++++++++++++++--- fs/xfs/linux-2.6/xfs_buf.c | 6 ++--- fs/xfs/quota/xfs_qm.c | 26 ++++++++++++++++----- fs/xfs/xfs_itable.c | 8 ++++--- 5 files changed, 66 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c index 2d3f90afe5f1..bc7405585def 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/linux-2.6/kmem.c @@ -16,7 +16,6 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include <linux/mm.h> -#include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/swap.h> #include <linux/blkdev.h> @@ -24,8 +23,25 @@ #include "time.h" #include "kmem.h" -#define MAX_VMALLOCS 6 -#define MAX_SLAB_SIZE 0x20000 +/* + * Greedy allocation. May fail and may return vmalloced memory. + * + * Must be freed using kmem_free_large. + */ +void * +kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) +{ + void *ptr; + size_t kmsize = maxsize; + + while (!(ptr = kmem_zalloc_large(kmsize))) { + if ((kmsize >>= 1) <= minsize) + kmsize = minsize; + } + if (ptr) + *size = kmsize; + return ptr; +} void * kmem_alloc(size_t size, unsigned int __nocast flags) @@ -34,19 +50,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags) gfp_t lflags = kmem_flags_convert(flags); void *ptr; -#ifdef DEBUG - if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) { - printk(KERN_WARNING "Large %s attempt, size=%ld\n", - __func__, (long)size); - dump_stack(); - } -#endif - do { - if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS) - ptr = kmalloc(size, lflags); - else - ptr = __vmalloc(size, lflags, PAGE_KERNEL); + ptr = kmalloc(size, lflags); if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) return ptr; if (!(++retries % 100)) @@ -68,27 +73,6 @@ kmem_zalloc(size_t size, unsigned int __nocast flags) return ptr; } -void * -kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize, - unsigned int __nocast flags) -{ - void *ptr; - size_t kmsize = maxsize; - unsigned int kmflags = (flags & ~KM_SLEEP) | KM_NOSLEEP; - - while (!(ptr = kmem_zalloc(kmsize, kmflags))) { - if ((kmsize <= minsize) && (flags & KM_NOSLEEP)) - break; - if ((kmsize >>= 1) <= minsize) { - kmsize = minsize; - kmflags = flags; - } - } - if (ptr) - *size = kmsize; - return ptr; -} - void kmem_free(const void *ptr) { diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h index 179cbd630f69..f7c8f7a9ea6d 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/linux-2.6/kmem.h @@ -21,6 +21,7 @@ #include <linux/slab.h> #include <linux/sched.h> #include <linux/mm.h> +#include <linux/vmalloc.h> /* * General memory allocation interfaces @@ -30,7 +31,6 @@ #define KM_NOSLEEP 0x0002u #define KM_NOFS 0x0004u #define KM_MAYFAIL 0x0008u -#define KM_LARGE 0x0010u /* * We use a special process flag to avoid recursive callbacks into @@ -42,7 +42,7 @@ kmem_flags_convert(unsigned int __nocast flags) { gfp_t lflags; - BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_LARGE)); + BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL)); if (flags & KM_NOSLEEP) { lflags = GFP_ATOMIC | __GFP_NOWARN; @@ -56,10 +56,25 @@ kmem_flags_convert(unsigned int __nocast flags) extern void *kmem_alloc(size_t, unsigned int __nocast); extern void *kmem_zalloc(size_t, unsigned int __nocast); -extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast); extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast); extern void kmem_free(const void *); +static inline void *kmem_zalloc_large(size_t size) +{ + void *ptr; + + ptr = vmalloc(size); + if (ptr) + memset(ptr, 0, size); + return ptr; +} +static inline void kmem_free_large(void *ptr) +{ + vfree(ptr); +} + +extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); + /* * Zone interfaces */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 730eff1e71a3..44e20e578ba0 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1525,8 +1525,8 @@ xfs_alloc_bufhash( btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; - btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) * - sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE); + btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * + sizeof(xfs_bufhash_t)); for (i = 0; i < (1 << btp->bt_hashshift); i++) { spin_lock_init(&btp->bt_hash[i].bh_lock); INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); @@ -1537,7 +1537,7 @@ STATIC void xfs_free_bufhash( xfs_buftarg_t *btp) { - kmem_free(btp->bt_hash); + kmem_free_large(btp->bt_hash); btp->bt_hash = NULL; } diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 9e627a8b5b0e..11cfd8245c7c 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -118,9 +118,14 @@ xfs_Gqm_init(void) */ udqhash = kmem_zalloc_greedy(&hsize, XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t), - XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t), - KM_SLEEP | KM_MAYFAIL | KM_LARGE); - gdqhash = kmem_zalloc(hsize, KM_SLEEP | KM_LARGE); + XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t)); + if (!udqhash) + goto out; + + gdqhash = kmem_zalloc_large(hsize); + if (!udqhash) + goto out_free_udqhash; + hsize /= sizeof(xfs_dqhash_t); ndquot = hsize << 8; @@ -170,6 +175,11 @@ xfs_Gqm_init(void) mutex_init(&qcheck_lock); #endif return xqm; + + out_free_udqhash: + kmem_free_large(udqhash); + out: + return NULL; } /* @@ -189,8 +199,8 @@ xfs_qm_destroy( xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i])); } - kmem_free(xqm->qm_usr_dqhtable); - kmem_free(xqm->qm_grp_dqhtable); + kmem_free_large(xqm->qm_usr_dqhtable); + kmem_free_large(xqm->qm_grp_dqhtable); xqm->qm_usr_dqhtable = NULL; xqm->qm_grp_dqhtable = NULL; xqm->qm_dqhashmask = 0; @@ -219,8 +229,12 @@ xfs_qm_hold_quotafs_ref( */ mutex_lock(&xfs_Gqm_lock); - if (xfs_Gqm == NULL) + if (!xfs_Gqm) { xfs_Gqm = xfs_Gqm_init(); + if (!xfs_Gqm) + return ENOMEM; + } + /* * We can keep a list of all filesystems with quotas mounted for * debugging and statistical purposes, but ... diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 940307a6a60b..3af02314c605 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -408,8 +408,10 @@ xfs_bulkstat( (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); nimask = ~(nicluster - 1); nbcluster = nicluster >> mp->m_sb.sb_inopblog; - irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4, - KM_SLEEP | KM_MAYFAIL | KM_LARGE); + irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); + if (!irbuf) + return ENOMEM; + nirbuf = irbsize / sizeof(*irbuf); /* @@ -727,7 +729,7 @@ xfs_bulkstat( /* * Done, we're either out of filesystem or space to put the data. */ - kmem_free(irbuf); + kmem_free_large(irbuf); *ubcountp = ubelem; /* * Found some inodes, return them now and return the error next time. -- cgit v1.2.3 From 9b00f30762fe9f914eb6e03057a616ed63a4e8ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Thu, 21 Jan 2010 11:17:20 +0000 Subject: xfs: quota limit statvfs available blocks A "df" run on an NFS client of an exported XFS file system reports the wrong information for "available" blocks. When a block quota is enforced, the amount reported as free is limited by the quota, but the amount reported available is not (and should be). Reported-by: Guk-Bong, Kwon <gbkwon@gmail.com> Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/quota/xfs_qm_bhv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index a5346630dfae..97b410c12794 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -59,7 +59,7 @@ xfs_fill_statvfs_from_dquot( be64_to_cpu(dp->d_blk_hardlimit); if (limit && statp->f_blocks > limit) { statp->f_blocks = limit; - statp->f_bfree = + statp->f_bfree = statp->f_bavail = (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; } -- cgit v1.2.3 From 19f5fb7ad679bb361222c7916086435020c37cce Mon Sep 17 00:00:00 2001 From: Theodore Ts'o <tytso@mit.edu> Date: Sun, 24 Jan 2010 14:34:07 -0500 Subject: ext4: Use bitops to read/modify EXT4_I(inode)->i_state At several places we modify EXT4_I(inode)->i_state without holding i_mutex (ext4_release_file, ext4_bmap, ext4_journalled_writepage, ext4_do_update_inode, ...). These modifications are racy and we can lose updates to i_state. So convert handling of i_state to use bitops which are atomic. Cc: Jan Kara <jack@suse.cz> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/ext4.h | 41 +++++++++++++++++++++++++++++------------ fs/ext4/extents.c | 8 ++++---- fs/ext4/file.c | 4 ++-- fs/ext4/ialloc.c | 3 ++- fs/ext4/inode.c | 38 ++++++++++++++++++++------------------ fs/ext4/migrate.c | 6 +++--- fs/ext4/xattr.c | 22 +++++++++++----------- 7 files changed, 71 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 307ecd13a762..ffe1334c891e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -313,17 +313,6 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) return flags & EXT4_OTHER_FLMASK; } -/* - * Inode dynamic state flags - */ -#define EXT4_STATE_JDATA 0x00000001 /* journaled data exists */ -#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */ -#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ -#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ -#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ -#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */ -#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/ - /* Used to pass group descriptor data when online resize is done */ struct ext4_new_group_input { __u32 group; /* Group number for this data */ @@ -631,7 +620,7 @@ struct ext4_inode_info { * near to their parent directory's inode. */ ext4_group_t i_block_group; - __u32 i_state; /* Dynamic state flags for ext4 */ + unsigned long i_state_flags; /* Dynamic state flags */ ext4_lblk_t i_dir_start_lookup; #ifdef CONFIG_EXT4_FS_XATTR @@ -1051,6 +1040,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) (ino >= EXT4_FIRST_INO(sb) && ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } + +/* + * Inode dynamic state flags + */ +enum { + EXT4_STATE_JDATA, /* journaled data exists */ + EXT4_STATE_NEW, /* inode is newly created */ + EXT4_STATE_XATTR, /* has in-inode xattrs */ + EXT4_STATE_NO_EXPAND, /* No space for expansion */ + EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ + EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ + EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ +}; + +static inline int ext4_test_inode_state(struct inode *inode, int bit) +{ + return test_bit(bit, &EXT4_I(inode)->i_state_flags); +} + +static inline void ext4_set_inode_state(struct inode *inode, int bit) +{ + set_bit(bit, &EXT4_I(inode)->i_state_flags); +} + +static inline void ext4_clear_inode_state(struct inode *inode, int bit) +{ + clear_bit(bit, &EXT4_I(inode)->i_state_flags); +} #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c56877972b0e..54616157c0f3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3076,7 +3076,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, if (io) io->flag = DIO_AIO_UNWRITTEN; else - EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN; + ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); goto out; } /* async DIO end_io complete, convert the filled extent to written */ @@ -3362,8 +3362,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, if (io) io->flag = DIO_AIO_UNWRITTEN; else - EXT4_I(inode)->i_state |= - EXT4_STATE_DIO_UNWRITTEN;; + ext4_set_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN); } } err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); @@ -3739,7 +3739,7 @@ static int ext4_xattr_fiemap(struct inode *inode, int error = 0; /* in-inode? */ - if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct ext4_iloc iloc; int offset; /* offset of xattr in inode */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 9630583cef28..f6071ce0b155 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -35,9 +35,9 @@ */ static int ext4_release_file(struct inode *inode, struct file *filp) { - if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) { + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { ext4_alloc_da_blocks(inode); - EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE; + ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f3624ead4f6c..2fab5adae1e2 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1029,7 +1029,8 @@ got: inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); - ei->i_state = EXT4_STATE_NEW; + ei->i_state_flags = 0; + ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1a3d7b232cd7..3e530119d7f0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1307,7 +1307,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, * i_data's format changing. Force the migrate * to fail by clearing migrate flags */ - EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); } /* @@ -1794,7 +1794,7 @@ static int ext4_journalled_write_end(struct file *file, new_i_size = pos + copied; if (new_i_size > inode->i_size) i_size_write(inode, pos+copied); - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; + ext4_set_inode_state(inode, EXT4_STATE_JDATA); if (new_i_size > EXT4_I(inode)->i_disksize) { ext4_update_i_disksize(inode, new_i_size); ret2 = ext4_mark_inode_dirty(handle, inode); @@ -2632,7 +2632,7 @@ static int __ext4_journalled_writepage(struct page *page, ret = err; walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; + ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: return ret; } @@ -3303,7 +3303,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) filemap_write_and_wait(mapping); } - if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { + if (EXT4_JOURNAL(inode) && + ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { /* * This is a REALLY heavyweight approach, but the use of * bmap on dirty files is expected to be extremely rare: @@ -3322,7 +3323,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) * everything they get. */ - EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; + ext4_clear_inode_state(inode, EXT4_STATE_JDATA); journal = EXT4_JOURNAL(inode); jbd2_journal_lock_updates(journal); err = jbd2_journal_flush(journal); @@ -3790,8 +3791,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { ext4_free_io_end(iocb->private); iocb->private = NULL; - } else if (ret > 0 && (EXT4_I(inode)->i_state & - EXT4_STATE_DIO_UNWRITTEN)) { + } else if (ret > 0 && ext4_test_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN)) { int err; /* * for non AIO case, since the IO is already @@ -3801,7 +3802,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, offset, ret); if (err < 0) ret = err; - EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; + ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } return ret; } @@ -4457,7 +4458,7 @@ void ext4_truncate(struct inode *inode) return; if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) - ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; + ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { ext4_ext_truncate(inode); @@ -4743,7 +4744,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) { /* We have all inode data except xattrs in memory here. */ return __ext4_get_inode_loc(inode, iloc, - !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); + !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); } void ext4_set_inode_flags(struct inode *inode) @@ -4837,7 +4838,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); - ei->i_state = 0; + ei->i_state_flags = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. @@ -4920,7 +4921,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) - ei->i_state |= EXT4_STATE_XATTR; + ext4_set_inode_state(inode, EXT4_STATE_XATTR); } } else ei->i_extra_isize = 0; @@ -5060,7 +5061,7 @@ static int ext4_do_update_inode(handle_t *handle, /* For fields not not tracking in the in-memory inode, * initialise them to zero for new inodes. */ - if (ei->i_state & EXT4_STATE_NEW) + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); ext4_get_inode_flags(ei); @@ -5156,7 +5157,7 @@ static int ext4_do_update_inode(handle_t *handle, rc = ext4_handle_dirty_metadata(handle, inode, bh); if (!err) err = rc; - ei->i_state &= ~EXT4_STATE_NEW; + ext4_clear_inode_state(inode, EXT4_STATE_NEW); ext4_update_inode_fsync_trans(handle, inode, 0); out_brelse: @@ -5580,8 +5581,8 @@ static int ext4_expand_extra_isize(struct inode *inode, entry = IFIRST(header); /* No extended attributes present */ - if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || - header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || + header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, new_extra_isize); EXT4_I(inode)->i_extra_isize = new_extra_isize; @@ -5625,7 +5626,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) err = ext4_reserve_inode_write(handle, inode, &iloc); if (ext4_handle_valid(handle) && EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && - !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { + !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { /* * We need extra buffer credits since we may write into EA block * with this same handle. If journal_extend fails, then it will @@ -5639,7 +5640,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) sbi->s_want_extra_isize, iloc, handle); if (ret) { - EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; + ext4_set_inode_state(inode, + EXT4_STATE_NO_EXPAND); if (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count)) { ext4_warning(inode->i_sb, __func__, diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 81415814b00b..46a4101e0aec 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -365,12 +365,12 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * happened after we started the migrate. We need to * fail the migrate */ - if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) { + if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) { retval = -EAGAIN; up_write(&EXT4_I(inode)->i_data_sem); goto err_out; } else - EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); /* * We have the extent map build with the tmp inode. * Now copy the i_data across @@ -533,7 +533,7 @@ int ext4_ext_migrate(struct inode *inode) * allocation. */ down_read((&EXT4_I(inode)->i_data_sem)); - EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE; + ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); up_read((&EXT4_I(inode)->i_data_sem)); handle = ext4_journal_start(inode, 1); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index f3a2f7ed45aa..c619a7ea670d 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -267,7 +267,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *end; int error; - if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return -ENODATA; error = ext4_get_inode_loc(inode, &iloc); if (error) @@ -396,7 +396,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) void *end; int error; - if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return 0; error = ext4_get_inode_loc(inode, &iloc); if (error) @@ -908,7 +908,7 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, is->s.base = is->s.first = IFIRST(header); is->s.here = is->s.first; is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { error = ext4_xattr_check_names(IFIRST(header), is->s.end); if (error) return error; @@ -940,10 +940,10 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, header = IHDR(inode, ext4_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); - EXT4_I(inode)->i_state |= EXT4_STATE_XATTR; + ext4_set_inode_state(inode, EXT4_STATE_XATTR); } else { header->h_magic = cpu_to_le32(0); - EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR; + ext4_clear_inode_state(inode, EXT4_STATE_XATTR); } return 0; } @@ -986,8 +986,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (strlen(name) > 255) return -ERANGE; down_write(&EXT4_I(inode)->xattr_sem); - no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND; - EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; + no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); error = ext4_get_inode_loc(inode, &is.iloc); if (error) @@ -997,10 +997,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (error) goto cleanup; - if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) { struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); - EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW; + ext4_clear_inode_state(inode, EXT4_STATE_NEW); } error = ext4_xattr_ibody_find(inode, &i, &is); @@ -1052,7 +1052,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, ext4_xattr_update_super_block(handle, inode->i_sb); inode->i_ctime = ext4_current_time(inode); if (!value) - EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); /* * The bh is consumed by ext4_mark_iloc_dirty, even with @@ -1067,7 +1067,7 @@ cleanup: brelse(is.iloc.bh); brelse(bs.bh); if (no_expand == 0) - EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); up_write(&EXT4_I(inode)->xattr_sem); return error; } -- cgit v1.2.3 From f710b4b96ba292dfed2153afc47e9063b0abfd89 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o <tytso@mit.edu> Date: Mon, 25 Jan 2010 03:31:32 -0500 Subject: ext4: Reserve INCOMPAT_EA_INODE and INCOMPAT_DIRDATA feature codepoints Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/ext4.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ffe1334c891e..61cf3b3cde4e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -284,6 +284,7 @@ struct flex_groups { #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ @@ -1144,6 +1145,8 @@ static inline void ext4_clear_inode_state(struct inode *inode, int bit) #define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 #define EXT4_FEATURE_INCOMPAT_MMP 0x0100 #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 +#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ +#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ -- cgit v1.2.3 From cbe132a8bdcff0f9afd9060948fb50597c7400b8 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Tue, 26 Jan 2010 15:08:49 +1100 Subject: xfs: don't hold onto reserved blocks on remount,ro If we hold onto reserved blocks when doing a remount,ro we end up writing the blocks used count to disk that includes the reserved blocks. Reserved blocks are not actually used, so this results in the values in the superblock being incorrect. Hence if we run xfs_check or xfs_repair -n while the filesystem is mounted remount,ro we end up with an inconsistent filesystem being reported. Also, running xfs_copy on the remount,ro filesystem will result in an inconsistent image being generated. To fix this, unreserve the blocks when doing the remount,ro, and reserved them again on remount,rw. This way a remount,ro filesystem will appear consistent on disk to all utilities. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/linux-2.6/xfs_super.c | 28 ++++++++++++++++++++++++++++ fs/xfs/xfs_mount.h | 1 + 2 files changed, 29 insertions(+) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 9f2e398a5616..e9c21454b9e2 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1318,6 +1318,8 @@ xfs_fs_remount( /* ro -> rw */ if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { + __uint64_t resblks; + mp->m_flags &= ~XFS_MOUNT_RDONLY; if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_mountfs_check_barriers(mp); @@ -1335,11 +1337,37 @@ xfs_fs_remount( } mp->m_update_flags = 0; } + + /* + * Fill out the reserve pool if it is empty. Use the stashed + * value if it is non-zero, otherwise go with the default. + */ + if (mp->m_resblks_save) { + resblks = mp->m_resblks_save; + mp->m_resblks_save = 0; + } else { + resblks = mp->m_sb.sb_dblocks; + do_div(resblks, 20); + resblks = min_t(__uint64_t, resblks, 1024); + } + xfs_reserve_blocks(mp, &resblks, NULL); } /* rw -> ro */ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { + /* + * After we have synced the data but before we sync the + * metadata, we need to free up the reserve block pool so that + * the used block count in the superblock on disk is correct at + * the end of the remount. Stash the current reserve pool size + * so that if we get remounted rw, we can return it to the same + * size. + */ + __uint64_t resblks = 0; + xfs_quiesce_data(mp); + mp->m_resblks_save = mp->m_resblks; + xfs_reserve_blocks(mp, &resblks, NULL); xfs_quiesce_attr(mp); mp->m_flags |= XFS_MOUNT_RDONLY; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index f4d1441f3f15..02d45f213e58 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -225,6 +225,7 @@ typedef struct xfs_mount { __uint64_t m_maxioffset; /* maximum inode offset */ __uint64_t m_resblks; /* total reserved blocks */ __uint64_t m_resblks_avail;/* available reserved blocks */ + __uint64_t m_resblks_save; /* reserved blks @ remount,ro */ int m_dalign; /* stripe unit */ int m_swidth; /* stripe width */ int m_sinoalign; /* stripe unit inode alignment */ -- cgit v1.2.3 From 388f1f0c346b533b06d8bc792f7204ebc3e4b7da Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Tue, 26 Jan 2010 15:10:15 +1100 Subject: xfs: turn off sign warnings Because they cause warnings in static inline functions conditionally compiled into XFS from the VFS (e.g. fsnotify). Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 19267019dacf..5c5a366aa332 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -16,7 +16,7 @@ # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA # -EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 -Wpointer-sign +EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 XFS_LINUX := linux-2.6 -- cgit v1.2.3 From d5db0f97fbbeff11c88dec1aaf1536a975afbaeb Mon Sep 17 00:00:00 2001 From: Eric Sandeen <sandeen@sandeen.net> Date: Fri, 5 Feb 2010 22:59:53 +0000 Subject: xfs: more reserved blocks fixups This mangles the reserved blocks counts a little more. 1) add a helper function for the default reserved count 2) add helper functions to save/restore counts on ro/rw 3) save/restore reserved blocks on freeze/thaw 4) disallow changing reserved count while readonly V2: changed field name to match Dave's changes Signed-off-by: Eric Sandeen <sandeen@sandeen.net> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_ioctl.c | 3 +++ fs/xfs/linux-2.6/xfs_super.c | 51 ++++++++++++++++++++++++++++++++------------ fs/xfs/xfs_mount.c | 34 +++++++++++++++++++---------- fs/xfs/xfs_mount.h | 1 + 4 files changed, 64 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 3906e85abfdc..4ea1ee18aded 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -1431,6 +1431,9 @@ xfs_file_ioctl( if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (mp->m_flags & XFS_MOUNT_RDONLY) + return -XFS_ERROR(EROFS); + if (copy_from_user(&inout, arg, sizeof(inout))) return -XFS_ERROR(EFAULT); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index e9c21454b9e2..6ce828e0e17b 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1256,6 +1256,29 @@ xfs_fs_statfs( return 0; } +STATIC void +xfs_save_resvblks(struct xfs_mount *mp) +{ + __uint64_t resblks = 0; + + mp->m_resblks_save = mp->m_resblks; + xfs_reserve_blocks(mp, &resblks, NULL); +} + +STATIC void +xfs_restore_resvblks(struct xfs_mount *mp) +{ + __uint64_t resblks; + + if (mp->m_resblks_save) { + resblks = mp->m_resblks_save; + mp->m_resblks_save = 0; + } else + resblks = xfs_default_resblks(mp); + + xfs_reserve_blocks(mp, &resblks, NULL); +} + STATIC int xfs_fs_remount( struct super_block *sb, @@ -1318,8 +1341,6 @@ xfs_fs_remount( /* ro -> rw */ if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { - __uint64_t resblks; - mp->m_flags &= ~XFS_MOUNT_RDONLY; if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_mountfs_check_barriers(mp); @@ -1342,15 +1363,7 @@ xfs_fs_remount( * Fill out the reserve pool if it is empty. Use the stashed * value if it is non-zero, otherwise go with the default. */ - if (mp->m_resblks_save) { - resblks = mp->m_resblks_save; - mp->m_resblks_save = 0; - } else { - resblks = mp->m_sb.sb_dblocks; - do_div(resblks, 20); - resblks = min_t(__uint64_t, resblks, 1024); - } - xfs_reserve_blocks(mp, &resblks, NULL); + xfs_restore_resvblks(mp); } /* rw -> ro */ @@ -1363,11 +1376,9 @@ xfs_fs_remount( * so that if we get remounted rw, we can return it to the same * size. */ - __uint64_t resblks = 0; xfs_quiesce_data(mp); - mp->m_resblks_save = mp->m_resblks; - xfs_reserve_blocks(mp, &resblks, NULL); + xfs_save_resvblks(mp); xfs_quiesce_attr(mp); mp->m_flags |= XFS_MOUNT_RDONLY; } @@ -1386,10 +1397,21 @@ xfs_fs_freeze( { struct xfs_mount *mp = XFS_M(sb); + xfs_save_resvblks(mp); xfs_quiesce_attr(mp); return -xfs_fs_log_dummy(mp); } +STATIC int +xfs_fs_unfreeze( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_restore_resvblks(mp); + return 0; +} + STATIC int xfs_fs_show_options( struct seq_file *m, @@ -1612,6 +1634,7 @@ static const struct super_operations xfs_super_operations = { .put_super = xfs_fs_put_super, .sync_fs = xfs_fs_sync_fs, .freeze_fs = xfs_fs_freeze, + .unfreeze_fs = xfs_fs_unfreeze, .statfs = xfs_fs_statfs, .remount_fs = xfs_fs_remount, .show_options = xfs_fs_show_options, diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 7f81ed72c875..5061149b2cc4 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1091,6 +1091,22 @@ xfs_mount_reset_sbqflags( return xfs_trans_commit(tp, 0); } +__uint64_t +xfs_default_resblks(xfs_mount_t *mp) +{ + __uint64_t resblks; + + /* + * We default to 5% or 1024 fsbs of space reserved, whichever is smaller. + * This may drive us straight to ENOSPC on mount, but that implies + * we were already there on the last unmount. Warn if this occurs. + */ + resblks = mp->m_sb.sb_dblocks; + do_div(resblks, 20); + resblks = min_t(__uint64_t, resblks, 1024); + return resblks; +} + /* * This function does the following on an initial mount of a file system: * - reads the superblock from disk and init the mount struct @@ -1401,18 +1417,14 @@ xfs_mountfs( * when at ENOSPC. This is needed for operations like create with * attr, unwritten extent conversion at ENOSPC, etc. Data allocations * are not allowed to use this reserved space. - * - * We default to 5% or 1024 fsbs of space reserved, whichever is smaller. - * This may drive us straight to ENOSPC on mount, but that implies - * we were already there on the last unmount. Warn if this occurs. */ - resblks = mp->m_sb.sb_dblocks; - do_div(resblks, 20); - resblks = min_t(__uint64_t, resblks, 1024); - error = xfs_reserve_blocks(mp, &resblks, NULL); - if (error) - cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. " - "Continuing without a reserve pool."); + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + resblks = xfs_default_resblks(mp); + error = xfs_reserve_blocks(mp, &resblks, NULL); + if (error) + cmn_err(CE_WARN, "XFS: Unable to allocate reserve " + "blocks. Continuing without a reserve pool."); + } return 0; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 02d45f213e58..70504fcf14cd 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -421,6 +421,7 @@ typedef struct xfs_mod_sb { } xfs_mod_sb_t; extern int xfs_log_sbcount(xfs_mount_t *, uint); +extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); -- cgit v1.2.3 From 777df5afdb26c71634edd60582be620ff94e87a0 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Sat, 6 Feb 2010 12:37:26 +1100 Subject: xfs: Make inode reclaim states explicit A.K.A.: don't rely on xfs_iflush() return value in reclaim We have gradually been moving checks out of the reclaim code because they are duplicated in xfs_iflush(). We've had a history of problems in this area, and many of them stem from the overloading of the return values from xfs_iflush() and interaction with inode flush locking to determine if the inode is safe to reclaim. With the desire to move to delayed write flushing of inodes and non-blocking inode tree reclaim walks, the overloading of the return value of xfs_iflush makes it very difficult to determine the correct thing to do next. This patch explicitly re-adds the checks to the inode reclaim code, removing the reliance on the return value of xfs_iflush() to determine what to do next. It also means that we can clearly document all the inode states that reclaim must handle and hence we can easily see that we handled all the necessary cases. This also removes the need for the xfs_inode_clean() check in xfs_iflush() as all callers now check this first (safely). Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/linux-2.6/xfs_sync.c | 81 ++++++++++++++++++++++++++++++++++----------- fs/xfs/xfs_inode.c | 11 +----- fs/xfs/xfs_inode.h | 1 + 3 files changed, 64 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index c9b863eacab7..525260c7617f 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -706,12 +706,43 @@ __xfs_inode_clear_reclaim_tag( XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); } +/* + * Inodes in different states need to be treated differently, and the return + * value of xfs_iflush is not sufficient to get this right. The following table + * lists the inode states and the reclaim actions necessary for non-blocking + * reclaim: + * + * + * inode state iflush ret required action + * --------------- ---------- --------------- + * bad - reclaim + * shutdown EIO unpin and reclaim + * clean, unpinned 0 reclaim + * stale, unpinned 0 reclaim + * clean, pinned(*) 0 unpin and reclaim + * stale, pinned 0 unpin and reclaim + * dirty, async 0 block on flush lock, reclaim + * dirty, sync flush 0 block on flush lock, reclaim + * + * (*) dgc: I don't think the clean, pinned state is possible but it gets + * handled anyway given the order of checks implemented. + * + * Hence the order of actions after gaining the locks should be: + * bad => reclaim + * shutdown => unpin and reclaim + * pinned => unpin + * stale => reclaim + * clean => reclaim + * dirty => flush, wait and reclaim + */ STATIC int xfs_reclaim_inode( struct xfs_inode *ip, struct xfs_perag *pag, int sync_mode) { + int error; + /* * The radix tree lock here protects a thread in xfs_iget from racing * with us starting reclaim on the inode. Once we have the @@ -729,30 +760,42 @@ xfs_reclaim_inode( spin_unlock(&ip->i_flags_lock); write_unlock(&pag->pag_ici_lock); - /* - * If the inode is still dirty, then flush it out. If the inode - * is not in the AIL, then it will be OK to flush it delwri as - * long as xfs_iflush() does not keep any references to the inode. - * We leave that decision up to xfs_iflush() since it has the - * knowledge of whether it's OK to simply do a delwri flush of - * the inode or whether we need to wait until the inode is - * pulled from the AIL. - * We get the flush lock regardless, though, just to make sure - * we don't free it while it is being flushed. - */ xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_iflock(ip); - /* - * In the case of a forced shutdown we rely on xfs_iflush() to - * wait for the inode to be unpinned before returning an error. - */ - if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { - /* synchronize with xfs_iflush_done */ - xfs_iflock(ip); - xfs_ifunlock(ip); + if (is_bad_inode(VFS_I(ip))) + goto reclaim; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_iunpin_wait(ip); + goto reclaim; + } + if (xfs_ipincount(ip)) + xfs_iunpin_wait(ip); + if (xfs_iflags_test(ip, XFS_ISTALE)) + goto reclaim; + if (xfs_inode_clean(ip)) + goto reclaim; + + /* Now we have an inode that needs flushing */ + error = xfs_iflush(ip, sync_mode); + if (!error) { + switch(sync_mode) { + case XFS_IFLUSH_DELWRI_ELSE_ASYNC: + case XFS_IFLUSH_DELWRI: + case XFS_IFLUSH_ASYNC: + case XFS_IFLUSH_DELWRI_ELSE_SYNC: + case XFS_IFLUSH_SYNC: + /* IO issued, synchronise with IO completion */ + xfs_iflock(ip); + break; + default: + ASSERT(0); + break; + } } +reclaim: + xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_ireclaim(ip); return 0; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d0d1b5a05183..8d0666dd170a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2493,7 +2493,7 @@ __xfs_iunpin_wait( wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); } -static inline void +void xfs_iunpin_wait( xfs_inode_t *ip) { @@ -2848,15 +2848,6 @@ xfs_iflush( iip = ip->i_itemp; mp = ip->i_mount; - /* - * If the inode isn't dirty, then just release the inode flush lock and - * do nothing. - */ - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - return 0; - } - /* * We can't flush the inode until it is unpinned, so wait for it if we * are allowed to block. We know noone new can pin it, because we are diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index ec1f28c4fc4f..8b618ea4d692 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -483,6 +483,7 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); void xfs_ipin(xfs_inode_t *); void xfs_iunpin(xfs_inode_t *); +void xfs_iunpin_wait(xfs_inode_t *); int xfs_iflush(xfs_inode_t *, uint); void xfs_ichgtime(xfs_inode_t *, int); void xfs_lock_inodes(xfs_inode_t **, int, uint); -- cgit v1.2.3 From c854363e80b49dd04a4de18ebc379eb8c8806674 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Sat, 6 Feb 2010 12:39:36 +1100 Subject: xfs: Use delayed write for inodes rather than async V2 We currently do background inode flush asynchronously, resulting in inodes being written in whatever order the background writeback issues them. Not only that, there are also blocking and non-blocking asynchronous inode flushes, depending on where the flush comes from. This patch completely removes asynchronous inode writeback. It removes all the strange writeback modes and replaces them with either a synchronous flush or a non-blocking delayed write flush. That is, inode flushes will only issue IO directly if they are synchronous, and background flushing may do nothing if the operation would block (e.g. on a pinned inode or buffer lock). Delayed write flushes will now result in the inode buffer sitting in the delwri queue of the buffer cache to be flushed by either an AIL push or by the xfsbufd timing out the buffer. This will allow accumulation of dirty inode buffers in memory and allow optimisation of inode cluster writeback at the xfsbufd level where we have much greater queue depths than the block layer elevators. We will also get adjacent inode cluster buffer IO merging for free when a later patch in the series allows sorting of the delayed write buffers before dispatch. This effectively means that any inode that is written back by background writeback will be seen as flush locked during AIL pushing, and will result in the buffers being pushed from there. This writeback path is currently non-optimal, but the next patch in the series will fix that problem. A side effect of this delayed write mechanism is that background inode reclaim will no longer directly flush inodes, nor can it wait on the flush lock. The result is that inode reclaim must leave the inode in the reclaimable state until it is clean. Hence attempts to reclaim a dirty inode in the background will simply skip the inode until it is clean and this allows other mechanisms (i.e. xfsbufd) to do more optimal writeback of the dirty buffers. As a result, the inode reclaim code has been rewritten so that it no longer relies on the ambiguous return values of xfs_iflush() to determine whether it is safe to reclaim an inode. Portions of this patch are derived from patches by Christoph Hellwig. Version 2: - cleanup reclaim code as suggested by Christoph - log background reclaim inode flush errors - just pass sync flags to xfs_iflush Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/linux-2.6/xfs_super.c | 4 +- fs/xfs/linux-2.6/xfs_sync.c | 105 +++++++++++++++++++++++++++++++------------ fs/xfs/xfs_inode.c | 75 +++---------------------------- fs/xfs/xfs_inode.h | 10 ----- fs/xfs/xfs_inode_item.c | 10 +++-- fs/xfs/xfs_mount.c | 13 +++++- 6 files changed, 102 insertions(+), 115 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 6ce828e0e17b..3b5b46b8e3b9 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1064,7 +1064,7 @@ xfs_fs_write_inode( xfs_ilock(ip, XFS_ILOCK_SHARED); xfs_iflock(ip); - error = xfs_iflush(ip, XFS_IFLUSH_SYNC); + error = xfs_iflush(ip, SYNC_WAIT); } else { error = EAGAIN; if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) @@ -1072,7 +1072,7 @@ xfs_fs_write_inode( if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) goto out_unlock; - error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK); + error = xfs_iflush(ip, 0); } out_unlock: diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 525260c7617f..a9f6d20aff41 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -270,8 +270,7 @@ xfs_sync_inode_attr( goto out_unlock; } - error = xfs_iflush(ip, (flags & SYNC_WAIT) ? - XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI); + error = xfs_iflush(ip, flags); out_unlock: xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -460,16 +459,18 @@ xfs_quiesce_fs( { int count = 0, pincount; + xfs_reclaim_inodes(mp, 0); xfs_flush_buftarg(mp->m_ddev_targp, 0); - xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); /* * This loop must run at least twice. The first instance of the loop * will flush most meta data but that will generate more meta data * (typically directory updates). Which then must be flushed and - * logged before we can write the unmount record. + * logged before we can write the unmount record. We also so sync + * reclaim of inodes to catch any that the above delwri flush skipped. */ do { + xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_sync_attr(mp, SYNC_WAIT); pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); if (!pincount) { @@ -585,7 +586,7 @@ xfs_sync_worker( if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { xfs_log_force(mp, 0); - xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); + xfs_reclaim_inodes(mp, 0); /* dgc: errors ignored here */ error = xfs_qm_sync(mp, SYNC_TRYLOCK); error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); @@ -719,21 +720,42 @@ __xfs_inode_clear_reclaim_tag( * shutdown EIO unpin and reclaim * clean, unpinned 0 reclaim * stale, unpinned 0 reclaim - * clean, pinned(*) 0 unpin and reclaim - * stale, pinned 0 unpin and reclaim - * dirty, async 0 block on flush lock, reclaim - * dirty, sync flush 0 block on flush lock, reclaim + * clean, pinned(*) 0 requeue + * stale, pinned EAGAIN requeue + * dirty, delwri ok 0 requeue + * dirty, delwri blocked EAGAIN requeue + * dirty, sync flush 0 reclaim * * (*) dgc: I don't think the clean, pinned state is possible but it gets * handled anyway given the order of checks implemented. * + * As can be seen from the table, the return value of xfs_iflush() is not + * sufficient to correctly decide the reclaim action here. The checks in + * xfs_iflush() might look like duplicates, but they are not. + * + * Also, because we get the flush lock first, we know that any inode that has + * been flushed delwri has had the flush completed by the time we check that + * the inode is clean. The clean inode check needs to be done before flushing + * the inode delwri otherwise we would loop forever requeuing clean inodes as + * we cannot tell apart a successful delwri flush and a clean inode from the + * return value of xfs_iflush(). + * + * Note that because the inode is flushed delayed write by background + * writeback, the flush lock may already be held here and waiting on it can + * result in very long latencies. Hence for sync reclaims, where we wait on the + * flush lock, the caller should push out delayed write inodes first before + * trying to reclaim them to minimise the amount of time spent waiting. For + * background relaim, we just requeue the inode for the next pass. + * * Hence the order of actions after gaining the locks should be: * bad => reclaim * shutdown => unpin and reclaim - * pinned => unpin + * pinned, delwri => requeue + * pinned, sync => unpin * stale => reclaim * clean => reclaim - * dirty => flush, wait and reclaim + * dirty, delwri => flush and requeue + * dirty, sync => flush, wait and reclaim */ STATIC int xfs_reclaim_inode( @@ -741,7 +763,7 @@ xfs_reclaim_inode( struct xfs_perag *pag, int sync_mode) { - int error; + int error = 0; /* * The radix tree lock here protects a thread in xfs_iget from racing @@ -761,7 +783,11 @@ xfs_reclaim_inode( write_unlock(&pag->pag_ici_lock); xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_iflock(ip); + if (!xfs_iflock_nowait(ip)) { + if (!(sync_mode & SYNC_WAIT)) + goto out; + xfs_iflock(ip); + } if (is_bad_inode(VFS_I(ip))) goto reclaim; @@ -769,8 +795,13 @@ xfs_reclaim_inode( xfs_iunpin_wait(ip); goto reclaim; } - if (xfs_ipincount(ip)) + if (xfs_ipincount(ip)) { + if (!(sync_mode & SYNC_WAIT)) { + xfs_ifunlock(ip); + goto out; + } xfs_iunpin_wait(ip); + } if (xfs_iflags_test(ip, XFS_ISTALE)) goto reclaim; if (xfs_inode_clean(ip)) @@ -778,27 +809,43 @@ xfs_reclaim_inode( /* Now we have an inode that needs flushing */ error = xfs_iflush(ip, sync_mode); - if (!error) { - switch(sync_mode) { - case XFS_IFLUSH_DELWRI_ELSE_ASYNC: - case XFS_IFLUSH_DELWRI: - case XFS_IFLUSH_ASYNC: - case XFS_IFLUSH_DELWRI_ELSE_SYNC: - case XFS_IFLUSH_SYNC: - /* IO issued, synchronise with IO completion */ - xfs_iflock(ip); - break; - default: - ASSERT(0); - break; - } + if (sync_mode & SYNC_WAIT) { + xfs_iflock(ip); + goto reclaim; } + /* + * When we have to flush an inode but don't have SYNC_WAIT set, we + * flush the inode out using a delwri buffer and wait for the next + * call into reclaim to find it in a clean state instead of waiting for + * it now. We also don't return errors here - if the error is transient + * then the next reclaim pass will flush the inode, and if the error + * is permanent then the next sync reclaim will relcaim the inode and + * pass on the error. + */ + if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_WARN, ip->i_mount, + "inode 0x%llx background reclaim flush failed with %d", + (long long)ip->i_ino, error); + } +out: + xfs_iflags_clear(ip, XFS_IRECLAIM); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* + * We could return EAGAIN here to make reclaim rescan the inode tree in + * a short while. However, this just burns CPU time scanning the tree + * waiting for IO to complete and xfssyncd never goes back to the idle + * state. Instead, return 0 to let the next scheduled background reclaim + * attempt to reclaim the inode again. + */ + return 0; + reclaim: xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_ireclaim(ip); - return 0; + return error; + } int diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8d0666dd170a..fa31360046d4 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2835,8 +2835,6 @@ xfs_iflush( xfs_dinode_t *dip; xfs_mount_t *mp; int error; - int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK); - enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) }; XFS_STATS_INC(xs_iflush_count); @@ -2859,7 +2857,7 @@ xfs_iflush( * in the same cluster are dirty, they will probably write the inode * out for us if they occur after the log force completes. */ - if (noblock && xfs_ipincount(ip)) { + if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { xfs_iunpin_nowait(ip); xfs_ifunlock(ip); return EAGAIN; @@ -2892,61 +2890,11 @@ xfs_iflush( return XFS_ERROR(EIO); } - /* - * Decide how buffer will be flushed out. This is done before - * the call to xfs_iflush_int because this field is zeroed by it. - */ - if (iip != NULL && iip->ili_format.ilf_fields != 0) { - /* - * Flush out the inode buffer according to the directions - * of the caller. In the cases where the caller has given - * us a choice choose the non-delwri case. This is because - * the inode is in the AIL and we need to get it out soon. - */ - switch (flags) { - case XFS_IFLUSH_SYNC: - case XFS_IFLUSH_DELWRI_ELSE_SYNC: - flags = 0; - break; - case XFS_IFLUSH_ASYNC_NOBLOCK: - case XFS_IFLUSH_ASYNC: - case XFS_IFLUSH_DELWRI_ELSE_ASYNC: - flags = INT_ASYNC; - break; - case XFS_IFLUSH_DELWRI: - flags = INT_DELWRI; - break; - default: - ASSERT(0); - flags = 0; - break; - } - } else { - switch (flags) { - case XFS_IFLUSH_DELWRI_ELSE_SYNC: - case XFS_IFLUSH_DELWRI_ELSE_ASYNC: - case XFS_IFLUSH_DELWRI: - flags = INT_DELWRI; - break; - case XFS_IFLUSH_ASYNC_NOBLOCK: - case XFS_IFLUSH_ASYNC: - flags = INT_ASYNC; - break; - case XFS_IFLUSH_SYNC: - flags = 0; - break; - default: - ASSERT(0); - flags = 0; - break; - } - } - /* * Get the buffer containing the on-disk inode. */ error = xfs_itobp(mp, NULL, ip, &dip, &bp, - noblock ? XBF_TRYLOCK : XBF_LOCK); + (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK); if (error || !bp) { xfs_ifunlock(ip); return error; @@ -2974,13 +2922,10 @@ xfs_iflush( if (error) goto cluster_corrupt_out; - if (flags & INT_DELWRI) { - xfs_bdwrite(mp, bp); - } else if (flags & INT_ASYNC) { - error = xfs_bawrite(mp, bp); - } else { + if (flags & SYNC_WAIT) error = xfs_bwrite(mp, bp); - } + else + xfs_bdwrite(mp, bp); return error; corrupt_out: @@ -3015,16 +2960,6 @@ xfs_iflush_int( iip = ip->i_itemp; mp = ip->i_mount; - - /* - * If the inode isn't dirty, then just release the inode - * flush lock and do nothing. - */ - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - return 0; - } - /* set *dip = inode's place in the buffer */ dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 8b618ea4d692..6c912b027596 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -419,16 +419,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) #define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) #define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) -/* - * Flags for xfs_iflush() - */ -#define XFS_IFLUSH_DELWRI_ELSE_SYNC 1 -#define XFS_IFLUSH_DELWRI_ELSE_ASYNC 2 -#define XFS_IFLUSH_SYNC 3 -#define XFS_IFLUSH_ASYNC 4 -#define XFS_IFLUSH_DELWRI 5 -#define XFS_IFLUSH_ASYNC_NOBLOCK 6 - /* * Flags for xfs_itruncate_start(). */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 48ec1c0b23ce..207553e82954 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -866,10 +866,14 @@ xfs_inode_item_push( iip->ili_format.ilf_fields != 0); /* - * Write out the inode. The completion routine ('iflush_done') will - * pull it from the AIL, mark it clean, unlock the flush lock. + * Push the inode to it's backing buffer. This will not remove the + * inode from the AIL - a further push will be required to trigger a + * buffer push. However, this allows all the dirty inodes to be pushed + * to the buffer before it is pushed to disk. THe buffer IO completion + * will pull th einode from the AIL, mark it clean and unlock the flush + * lock. */ - (void) xfs_iflush(ip, XFS_IFLUSH_ASYNC); + (void) xfs_iflush(ip, 0); xfs_iunlock(ip, XFS_ILOCK_SHARED); return; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 5061149b2cc4..6afaaeb2950a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1468,7 +1468,18 @@ xfs_unmountfs( * need to force the log first. */ xfs_log_force(mp, XFS_LOG_SYNC); - xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC); + + /* + * Do a delwri reclaim pass first so that as many dirty inodes are + * queued up for IO as possible. Then flush the buffers before making + * a synchronous path to catch all the remaining inodes are reclaimed. + * This makes the reclaim process as quick as possible by avoiding + * synchronous writeout and blocking on inodes already in the delwri + * state as much as possible. + */ + xfs_reclaim_inodes(mp, 0); + XFS_bflush(mp->m_ddev_targp); + xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_qm_unmount(mp); -- cgit v1.2.3 From d808f617ad00a413585b806de340feda5ad9a2da Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Tue, 2 Feb 2010 10:13:42 +1100 Subject: xfs: Don't issue buffer IO direct from AIL push V2 All buffers logged into the AIL are marked as delayed write. When the AIL needs to push the buffer out, it issues an async write of the buffer. This means that IO patterns are dependent on the order of buffers in the AIL. Instead of flushing the buffer, promote the buffer in the delayed write list so that the next time the xfsbufd is run the buffer will be flushed by the xfsbufd. Return the state to the xfsaild that the buffer was promoted so that the xfsaild knows that it needs to cause the xfsbufd to run to flush the buffers that were promoted. Using the xfsbufd for issuing the IO allows us to dispatch all buffer IO from the one queue. This means that we can make much more enlightened decisions on what order to flush buffers to disk as we don't have multiple places issuing IO. Optimisations to xfsbufd will be in a future patch. Version 2 - kill XFS_ITEM_FLUSHING as it is now unused. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/linux-2.6/xfs_buf.c | 29 +++++++++++++ fs/xfs/linux-2.6/xfs_buf.h | 2 + fs/xfs/linux-2.6/xfs_trace.h | 1 + fs/xfs/quota/xfs_dquot_item.c | 85 ++++++------------------------------- fs/xfs/quota/xfs_dquot_item.h | 4 -- fs/xfs/xfs_buf_item.c | 64 +++++++++++++++------------- fs/xfs/xfs_inode_item.c | 98 +++++++------------------------------------ fs/xfs/xfs_inode_item.h | 6 --- fs/xfs/xfs_trans.h | 3 +- fs/xfs/xfs_trans_ail.c | 13 +++--- 10 files changed, 102 insertions(+), 203 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 44e20e578ba0..b306265caa33 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1778,6 +1778,35 @@ xfs_buf_delwri_dequeue( trace_xfs_buf_delwri_dequeue(bp, _RET_IP_); } +/* + * If a delwri buffer needs to be pushed before it has aged out, then promote + * it to the head of the delwri queue so that it will be flushed on the next + * xfsbufd run. We do this by resetting the queuetime of the buffer to be older + * than the age currently needed to flush the buffer. Hence the next time the + * xfsbufd sees it is guaranteed to be considered old enough to flush. + */ +void +xfs_buf_delwri_promote( + struct xfs_buf *bp) +{ + struct xfs_buftarg *btp = bp->b_target; + long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1; + + ASSERT(bp->b_flags & XBF_DELWRI); + ASSERT(bp->b_flags & _XBF_DELWRI_Q); + + /* + * Check the buffer age before locking the delayed write queue as we + * don't need to promote buffers that are already past the flush age. + */ + if (bp->b_queuetime < jiffies - age) + return; + bp->b_queuetime = jiffies - age; + spin_lock(&btp->bt_delwrite_lock); + list_move(&bp->b_list, &btp->bt_delwrite_queue); + spin_unlock(&btp->bt_delwrite_lock); +} + STATIC void xfs_buf_runall_queues( struct workqueue_struct *queue) diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index ea8c198f0c39..be45e8c5768d 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -266,6 +266,7 @@ extern int xfs_buf_ispin(xfs_buf_t *); /* Delayed Write Buffer Routines */ extern void xfs_buf_delwri_dequeue(xfs_buf_t *); +extern void xfs_buf_delwri_promote(xfs_buf_t *); /* Buffer Daemon Setup Routines */ extern int xfs_buf_init(void); @@ -395,6 +396,7 @@ extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); extern int xfs_flush_buftarg(xfs_buftarg_t *, int); + #ifdef CONFIG_KDB_MODULES extern struct list_head *xfs_get_buftarg_list(void); #endif diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index 1bb09e70b2eb..a4574dcf5065 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -483,6 +483,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 1b564376d50c..dda0fb045c8a 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -212,18 +212,10 @@ xfs_qm_dquot_logitem_pushbuf( xfs_dquot_t *dqp; xfs_mount_t *mp; xfs_buf_t *bp; - uint dopush; dqp = qip->qli_dquot; ASSERT(XFS_DQ_IS_LOCKED(dqp)); - /* - * The qli_pushbuf_flag keeps others from - * trying to duplicate our effort. - */ - ASSERT(qip->qli_pushbuf_flag != 0); - ASSERT(qip->qli_push_owner == current_pid()); - /* * If flushlock isn't locked anymore, chances are that the * inode flush completed and the inode was taken off the AIL. @@ -231,47 +223,20 @@ xfs_qm_dquot_logitem_pushbuf( */ if (completion_done(&dqp->q_flush) || ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { - qip->qli_pushbuf_flag = 0; xfs_dqunlock(dqp); return; } mp = dqp->q_mount; bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK); - if (bp != NULL) { - if (XFS_BUF_ISDELAYWRITE(bp)) { - dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && - !completion_done(&dqp->q_flush)); - qip->qli_pushbuf_flag = 0; - xfs_dqunlock(dqp); - - if (XFS_BUF_ISPINNED(bp)) - xfs_log_force(mp, 0); - - if (dopush) { - int error; -#ifdef XFSRACEDEBUG - delay_for_intr(); - delay(300); -#endif - error = xfs_bawrite(mp, bp); - if (error) - xfs_fs_cmn_err(CE_WARN, mp, - "xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p", - error, qip, bp); - } else { - xfs_buf_relse(bp); - } - } else { - qip->qli_pushbuf_flag = 0; - xfs_dqunlock(dqp); - xfs_buf_relse(bp); - } + xfs_dqunlock(dqp); + if (!bp) return; - } + if (XFS_BUF_ISDELAYWRITE(bp)) + xfs_buf_delwri_promote(bp); + xfs_buf_relse(bp); + return; - qip->qli_pushbuf_flag = 0; - xfs_dqunlock(dqp); } /* @@ -289,50 +254,24 @@ xfs_qm_dquot_logitem_trylock( xfs_dq_logitem_t *qip) { xfs_dquot_t *dqp; - uint retval; dqp = qip->qli_dquot; if (atomic_read(&dqp->q_pincount) > 0) - return (XFS_ITEM_PINNED); + return XFS_ITEM_PINNED; if (! xfs_qm_dqlock_nowait(dqp)) - return (XFS_ITEM_LOCKED); + return XFS_ITEM_LOCKED; - retval = XFS_ITEM_SUCCESS; if (!xfs_dqflock_nowait(dqp)) { /* - * The dquot is already being flushed. It may have been - * flushed delayed write, however, and we don't want to - * get stuck waiting for that to complete. So, we want to check - * to see if we can lock the dquot's buffer without sleeping. - * If we can and it is marked for delayed write, then we - * hold it and send it out from the push routine. We don't - * want to do that now since we might sleep in the device - * strategy routine. We also don't want to grab the buffer lock - * here because we'd like not to call into the buffer cache - * while holding the AIL lock. - * Make sure to only return PUSHBUF if we set pushbuf_flag - * ourselves. If someone else is doing it then we don't - * want to go to the push routine and duplicate their efforts. + * dquot has already been flushed to the backing buffer, + * leave it locked, pushbuf routine will unlock it. */ - if (qip->qli_pushbuf_flag == 0) { - qip->qli_pushbuf_flag = 1; - ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno); -#ifdef DEBUG - qip->qli_push_owner = current_pid(); -#endif - /* - * The dquot is left locked. - */ - retval = XFS_ITEM_PUSHBUF; - } else { - retval = XFS_ITEM_FLUSHING; - xfs_dqunlock_nonotify(dqp); - } + return XFS_ITEM_PUSHBUF; } ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL); - return (retval); + return XFS_ITEM_SUCCESS; } diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/quota/xfs_dquot_item.h index 5a632531f843..5acae2ada70b 100644 --- a/fs/xfs/quota/xfs_dquot_item.h +++ b/fs/xfs/quota/xfs_dquot_item.h @@ -27,10 +27,6 @@ typedef struct xfs_dq_logitem { xfs_log_item_t qli_item; /* common portion */ struct xfs_dquot *qli_dquot; /* dquot ptr */ xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ - unsigned short qli_pushbuf_flag; /* 1 bit used in push_ail */ -#ifdef DEBUG - uint64_t qli_push_owner; -#endif xfs_dq_logformat_t qli_format; /* logged structure */ } xfs_dq_logitem_t; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index e0a11583ce5a..f3c49e69eab9 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -467,8 +467,10 @@ xfs_buf_item_unpin_remove( /* * This is called to attempt to lock the buffer associated with this * buf log item. Don't sleep on the buffer lock. If we can't get - * the lock right away, return 0. If we can get the lock, pull the - * buffer from the free list, mark it busy, and return 1. + * the lock right away, return 0. If we can get the lock, take a + * reference to the buffer. If this is a delayed write buffer that + * needs AIL help to be written back, invoke the pushbuf routine + * rather than the normal success path. */ STATIC uint xfs_buf_item_trylock( @@ -477,24 +479,18 @@ xfs_buf_item_trylock( xfs_buf_t *bp; bp = bip->bli_buf; - - if (XFS_BUF_ISPINNED(bp)) { + if (XFS_BUF_ISPINNED(bp)) return XFS_ITEM_PINNED; - } - - if (!XFS_BUF_CPSEMA(bp)) { + if (!XFS_BUF_CPSEMA(bp)) return XFS_ITEM_LOCKED; - } - /* - * Remove the buffer from the free list. Only do this - * if it's on the free list. Private buffers like the - * superblock buffer are not. - */ + /* take a reference to the buffer. */ XFS_BUF_HOLD(bp); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); trace_xfs_buf_item_trylock(bip); + if (XFS_BUF_ISDELAYWRITE(bp)) + return XFS_ITEM_PUSHBUF; return XFS_ITEM_SUCCESS; } @@ -626,11 +622,9 @@ xfs_buf_item_committed( } /* - * This is called to asynchronously write the buffer associated with this - * buf log item out to disk. The buffer will already have been locked by - * a successful call to xfs_buf_item_trylock(). If the buffer still has - * B_DELWRI set, then get it going out to disk with a call to bawrite(). - * If not, then just release the buffer. + * The buffer is locked, but is not a delayed write buffer. This happens + * if we race with IO completion and hence we don't want to try to write it + * again. Just release the buffer. */ STATIC void xfs_buf_item_push( @@ -642,17 +636,29 @@ xfs_buf_item_push( trace_xfs_buf_item_push(bip); bp = bip->bli_buf; + ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); + xfs_buf_relse(bp); +} - if (XFS_BUF_ISDELAYWRITE(bp)) { - int error; - error = xfs_bawrite(bip->bli_item.li_mountp, bp); - if (error) - xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp, - "xfs_buf_item_push: pushbuf error %d on bip %p, bp %p", - error, bip, bp); - } else { - xfs_buf_relse(bp); - } +/* + * The buffer is locked and is a delayed write buffer. Promote the buffer + * in the delayed write queue as the caller knows that they must invoke + * the xfsbufd to get this buffer written. We have to unlock the buffer + * to allow the xfsbufd to write it, too. + */ +STATIC void +xfs_buf_item_pushbuf( + xfs_buf_log_item_t *bip) +{ + xfs_buf_t *bp; + + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + trace_xfs_buf_item_pushbuf(bip); + + bp = bip->bli_buf; + ASSERT(XFS_BUF_ISDELAYWRITE(bp)); + xfs_buf_delwri_promote(bp); + xfs_buf_relse(bp); } /* ARGSUSED */ @@ -677,7 +683,7 @@ static struct xfs_item_ops xfs_buf_item_ops = { .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_buf_item_committed, .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push, - .iop_pushbuf = NULL, + .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_buf_item_pushbuf, .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_buf_item_committing }; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 207553e82954..d4dc063111f8 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -602,33 +602,20 @@ xfs_inode_item_trylock( if (!xfs_iflock_nowait(ip)) { /* - * If someone else isn't already trying to push the inode - * buffer, we get to do it. + * inode has already been flushed to the backing buffer, + * leave it locked in shared mode, pushbuf routine will + * unlock it. */ - if (iip->ili_pushbuf_flag == 0) { - iip->ili_pushbuf_flag = 1; -#ifdef DEBUG - iip->ili_push_owner = current_pid(); -#endif - /* - * Inode is left locked in shared mode. - * Pushbuf routine gets to unlock it. - */ - return XFS_ITEM_PUSHBUF; - } else { - /* - * We hold the AIL lock, so we must specify the - * NONOTIFY flag so that we won't double trip. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); - return XFS_ITEM_FLUSHING; - } - /* NOTREACHED */ + return XFS_ITEM_PUSHBUF; } /* Stale items should force out the iclog */ if (ip->i_flags & XFS_ISTALE) { xfs_ifunlock(ip); + /* + * we hold the AIL lock - notify the unlock routine of this + * so it doesn't try to get the lock again. + */ xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); return XFS_ITEM_PINNED; } @@ -746,11 +733,8 @@ xfs_inode_item_committed( * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK * failed to get the inode flush lock but did get the inode locked SHARED. * Here we're trying to see if the inode buffer is incore, and if so whether it's - * marked delayed write. If that's the case, we'll initiate a bawrite on that - * buffer to expedite the process. - * - * We aren't holding the AIL lock (or the flush lock) when this gets called, - * so it is inherently race-y. + * marked delayed write. If that's the case, we'll promote it and that will + * allow the caller to write the buffer by triggering the xfsbufd to run. */ STATIC void xfs_inode_item_pushbuf( @@ -759,26 +743,16 @@ xfs_inode_item_pushbuf( xfs_inode_t *ip; xfs_mount_t *mp; xfs_buf_t *bp; - uint dopush; ip = iip->ili_inode; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); - /* - * The ili_pushbuf_flag keeps others from - * trying to duplicate our effort. - */ - ASSERT(iip->ili_pushbuf_flag != 0); - ASSERT(iip->ili_push_owner == current_pid()); - /* * If a flush is not in progress anymore, chances are that the * inode was taken off the AIL. So, just get out. */ if (completion_done(&ip->i_flush) || ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { - iip->ili_pushbuf_flag = 0; xfs_iunlock(ip, XFS_ILOCK_SHARED); return; } @@ -787,53 +761,12 @@ xfs_inode_item_pushbuf( bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno, iip->ili_format.ilf_len, XBF_TRYLOCK); - if (bp != NULL) { - if (XFS_BUF_ISDELAYWRITE(bp)) { - /* - * We were racing with iflush because we don't hold - * the AIL lock or the flush lock. However, at this point, - * we have the buffer, and we know that it's dirty. - * So, it's possible that iflush raced with us, and - * this item is already taken off the AIL. - * If not, we can flush it async. - */ - dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) && - !completion_done(&ip->i_flush)); - iip->ili_pushbuf_flag = 0; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - trace_xfs_inode_item_push(bp, _RET_IP_); - - if (XFS_BUF_ISPINNED(bp)) - xfs_log_force(mp, 0); - - if (dopush) { - int error; - error = xfs_bawrite(mp, bp); - if (error) - xfs_fs_cmn_err(CE_WARN, mp, - "xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p", - error, iip, bp); - } else { - xfs_buf_relse(bp); - } - } else { - iip->ili_pushbuf_flag = 0; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - xfs_buf_relse(bp); - } - return; - } - /* - * We have to be careful about resetting pushbuf flag too early (above). - * Even though in theory we can do it as soon as we have the buflock, - * we don't want others to be doing work needlessly. They'll come to - * this function thinking that pushing the buffer is their - * responsibility only to find that the buffer is still locked by - * another doing the same thing - */ - iip->ili_pushbuf_flag = 0; xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (!bp) + return; + if (XFS_BUF_ISDELAYWRITE(bp)) + xfs_buf_delwri_promote(bp); + xfs_buf_relse(bp); return; } @@ -937,7 +870,6 @@ xfs_inode_item_init( /* We have zeroed memory. No need ... iip->ili_extents_buf = NULL; - iip->ili_pushbuf_flag = 0; */ iip->ili_format.ilf_type = XFS_LI_INODE; diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index cc8df1ac7783..9a467958ecdd 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -144,12 +144,6 @@ typedef struct xfs_inode_log_item { data exts */ struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged attr exts */ - unsigned int ili_pushbuf_flag; /* one bit used in push_ail */ - -#ifdef DEBUG - uint64_t ili_push_owner; /* one who sets pushbuf_flag - above gets to push the buf */ -#endif #ifdef XFS_TRANS_DEBUG int ili_root_size; char *ili_orig_root; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index ca64f33c63a3..c93e3a102857 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -861,8 +861,7 @@ typedef struct xfs_item_ops { #define XFS_ITEM_SUCCESS 0 #define XFS_ITEM_PINNED 1 #define XFS_ITEM_LOCKED 2 -#define XFS_ITEM_FLUSHING 3 -#define XFS_ITEM_PUSHBUF 4 +#define XFS_ITEM_PUSHBUF 3 /* * This structure is used to maintain a list of block ranges that have been diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index d7b1af8a832d..e799824f7245 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -253,6 +253,7 @@ xfsaild_push( int flush_log, count, stuck; xfs_mount_t *mp = ailp->xa_mount; struct xfs_ail_cursor *cur = &ailp->xa_cursors; + int push_xfsbufd = 0; spin_lock(&ailp->xa_lock); xfs_trans_ail_cursor_init(ailp, cur); @@ -308,6 +309,7 @@ xfsaild_push( XFS_STATS_INC(xs_push_ail_pushbuf); IOP_PUSHBUF(lip); last_pushed_lsn = lsn; + push_xfsbufd = 1; break; case XFS_ITEM_PINNED: @@ -322,12 +324,6 @@ xfsaild_push( stuck++; break; - case XFS_ITEM_FLUSHING: - XFS_STATS_INC(xs_push_ail_flushing); - last_pushed_lsn = lsn; - stuck++; - break; - default: ASSERT(0); break; @@ -374,6 +370,11 @@ xfsaild_push( xfs_log_force(mp, 0); } + if (push_xfsbufd) { + /* we've got delayed write buffers to flush */ + wake_up_process(mp->m_ddev_targp->bt_task); + } + if (!count) { /* We're past our target or empty, so idle */ last_pushed_lsn = 0; -- cgit v1.2.3 From 089716aa1480b7197bcd678b8477774c379a2768 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Tue, 26 Jan 2010 15:13:25 +1100 Subject: xfs: Sort delayed write buffers before dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently when the xfsbufd writes delayed write buffers, it pushes them to disk in the order they come off the delayed write list. If there are lots of buffers Ñ•pread widely over the disk, this results in overwhelming the elevator sort queues in the block layer and we end up losing the posibility of merging adjacent buffers to minimise the number of IOs. Use the new generic list_sort function to sort the delwri dispatch queue before issue to ensure that the buffers are pushed in the most friendly order possible to the lower layers. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/linux-2.6/xfs_buf.c | 87 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 60 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index b306265caa33..4556a4c31e36 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -33,6 +33,7 @@ #include <linux/migrate.h> #include <linux/backing-dev.h> #include <linux/freezer.h> +#include <linux/list_sort.h> #include "xfs_sb.h" #include "xfs_inum.h" @@ -1877,14 +1878,42 @@ xfs_buf_delwri_split( } +/* + * Compare function is more complex than it needs to be because + * the return value is only 32 bits and we are doing comparisons + * on 64 bit values + */ +static int +xfs_buf_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); + struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); + xfs_daddr_t diff; + + diff = ap->b_bn - bp->b_bn; + if (diff < 0) + return -1; + if (diff > 0) + return 1; + return 0; +} + +void +xfs_buf_delwri_sort( + xfs_buftarg_t *target, + struct list_head *list) +{ + list_sort(NULL, list, xfs_buf_cmp); +} + STATIC int xfsbufd( void *data) { - struct list_head tmp; - xfs_buftarg_t *target = (xfs_buftarg_t *)data; - int count; - xfs_buf_t *bp; + xfs_buftarg_t *target = (xfs_buftarg_t *)data; current->flags |= PF_MEMALLOC; @@ -1893,6 +1922,8 @@ xfsbufd( do { long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); + int count = 0; + struct list_head tmp; if (unlikely(freezing(current))) { set_bit(XBT_FORCE_SLEEP, &target->bt_flags); @@ -1907,11 +1938,10 @@ xfsbufd( schedule_timeout_interruptible(tout); xfs_buf_delwri_split(target, &tmp, age); - count = 0; + list_sort(NULL, &tmp, xfs_buf_cmp); while (!list_empty(&tmp)) { - bp = list_entry(tmp.next, xfs_buf_t, b_list); - ASSERT(target == bp->b_target); - + struct xfs_buf *bp; + bp = list_first_entry(&tmp, struct xfs_buf, b_list); list_del_init(&bp->b_list); xfs_buf_iostrategy(bp); count++; @@ -1937,42 +1967,45 @@ xfs_flush_buftarg( xfs_buftarg_t *target, int wait) { - struct list_head tmp; - xfs_buf_t *bp, *n; + xfs_buf_t *bp; int pincount = 0; + LIST_HEAD(tmp_list); + LIST_HEAD(wait_list); xfs_buf_runall_queues(xfsconvertd_workqueue); xfs_buf_runall_queues(xfsdatad_workqueue); xfs_buf_runall_queues(xfslogd_workqueue); set_bit(XBT_FORCE_FLUSH, &target->bt_flags); - pincount = xfs_buf_delwri_split(target, &tmp, 0); + pincount = xfs_buf_delwri_split(target, &tmp_list, 0); /* - * Dropped the delayed write list lock, now walk the temporary list + * Dropped the delayed write list lock, now walk the temporary list. + * All I/O is issued async and then if we need to wait for completion + * we do that after issuing all the IO. */ - list_for_each_entry_safe(bp, n, &tmp, b_list) { + list_sort(NULL, &tmp_list, xfs_buf_cmp); + while (!list_empty(&tmp_list)) { + bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); ASSERT(target == bp->b_target); - if (wait) + list_del_init(&bp->b_list); + if (wait) { bp->b_flags &= ~XBF_ASYNC; - else - list_del_init(&bp->b_list); - + list_add(&bp->b_list, &wait_list); + } xfs_buf_iostrategy(bp); } - if (wait) + if (wait) { + /* Expedite and wait for IO to complete. */ blk_run_address_space(target->bt_mapping); + while (!list_empty(&wait_list)) { + bp = list_first_entry(&wait_list, struct xfs_buf, b_list); - /* - * Remaining list items must be flushed before returning - */ - while (!list_empty(&tmp)) { - bp = list_entry(tmp.next, xfs_buf_t, b_list); - - list_del_init(&bp->b_list); - xfs_iowait(bp); - xfs_buf_relse(bp); + list_del_init(&bp->b_list); + xfs_iowait(bp); + xfs_buf_relse(bp); + } } return pincount; -- cgit v1.2.3 From 7d6a7bde52e449f21a0e86a7a4955b4e08a49d69 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Tue, 26 Jan 2010 15:13:41 +1100 Subject: xfs: Use delay write promotion for dquot flushing xfs_qm_dqflock_pushbuf_wait() does a very similar trick to item pushing used to do to flush out delayed write dquot buffers. Change it to use the new promotion method rather than an async flush. Also, xfs_qm_dqflock_pushbuf_wait() can return without the flush lock held, yet the callers make the assumption that after this call the flush lock is held. Always return with the flush lock held. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/quota/xfs_dquot.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index f9baeedbfdfe..1620a56b067e 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -1528,21 +1528,16 @@ xfs_qm_dqflock_pushbuf_wait( */ bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno, XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK); - if (bp != NULL) { - if (XFS_BUF_ISDELAYWRITE(bp)) { - int error; - - if (XFS_BUF_ISPINNED(bp)) - xfs_log_force(dqp->q_mount, 0); - error = xfs_bawrite(dqp->q_mount, bp); - if (error) - xfs_fs_cmn_err(CE_WARN, dqp->q_mount, - "xfs_qm_dqflock_pushbuf_wait: " - "pushbuf error %d on dqp %p, bp %p", - error, dqp, bp); - } else { - xfs_buf_relse(bp); - } + if (!bp) + goto out_lock; + + if (XFS_BUF_ISDELAYWRITE(bp)) { + if (XFS_BUF_ISPINNED(bp)) + xfs_log_force(dqp->q_mount, 0); + xfs_buf_delwri_promote(bp); + wake_up_process(bp->b_target->bt_task); } + xfs_buf_relse(bp); +out_lock: xfs_dqflock(dqp); } -- cgit v1.2.3 From d6783b2b6c4050df0ba0a84c6842cf5bc2212ef9 Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Tue, 26 Jan 2010 14:04:04 -0500 Subject: SUNRPC: Bury "#ifdef IPV6" in svc_create_xprt() Clean up: Bruce observed we have more or less common logic in each of svc_create_xprt()'s callers: the check to create an IPv6 RPC listener socket only if CONFIG_IPV6 is set. I'm about to add another case that does just the same. If we move the ifdefs into __svc_xpo_create(), then svc_create_xprt() call sites can get rid of the "#ifdef" ugliness, and can use the same logic with or without IPv6 support available in the kernel. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu> --- fs/lockd/svc.c | 2 -- fs/nfs/callback.c | 2 -- net/sunrpc/svc_xprt.c | 4 ++++ 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index e50cfa3d9654..7d150517ddf0 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -243,11 +243,9 @@ static int make_socks(struct svc_serv *serv) if (err < 0) goto out_err; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) err = create_lockd_family(serv, PF_INET6); if (err < 0 && err != -EAFNOSUPPORT) goto out_err; -#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */ warned = 0; return 0; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 73ab220354df..36dfdae95123 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -118,7 +118,6 @@ nfs4_callback_up(struct svc_serv *serv) dprintk("NFS: Callback listener port = %u (af %u)\n", nfs_callback_tcpport, PF_INET); -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ret = svc_create_xprt(serv, "tcp", PF_INET6, nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret > 0) { @@ -129,7 +128,6 @@ nfs4_callback_up(struct svc_serv *serv) ret = 0; else goto out_err; -#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ return svc_prepare_thread(serv, &serv->sv_pools[0]); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 7d1f9e928f69..f886ff367c80 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -173,11 +173,13 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, .sin_addr.s_addr = htonl(INADDR_ANY), .sin_port = htons(port), }; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct sockaddr_in6 sin6 = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_ANY_INIT, .sin6_port = htons(port), }; +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ struct sockaddr *sap; size_t len; @@ -186,10 +188,12 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, sap = (struct sockaddr *)&sin; len = sizeof(sin); break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case PF_INET6: sap = (struct sockaddr *)&sin6; len = sizeof(sin6); break; +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ default: return ERR_PTR(-EAFNOSUPPORT); } -- cgit v1.2.3 From 68717908155a9dcd4161f4d730fea478712d9794 Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Tue, 26 Jan 2010 14:04:13 -0500 Subject: SUNRPC: NFS kernel APIs shouldn't return ENOENT for "transport not found" write_ports() converts svc_create_xprt()'s ENOENT error return to EPROTONOSUPPORT so that rpc.nfsd (in user space) can report an error message that makes sense. It turns out that several of the other kernel APIs rpc.nfsd use can also return ENOENT from svc_create_xprt(), by way of lockd_up(). On the client side, an NFSv2 or NFSv3 mount request can also return the result of lockd_up(). This error may also be returned during an NFSv4 mount request, since the NFSv4 callback service uses svc_create_xprt() to create the callback listener. An ENOENT error return results in a confusing error message from the mount command. Let's have svc_create_xprt() return EPROTONOSUPPORT instead of ENOENT. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu> --- fs/nfsd/nfsctl.c | 6 +----- net/sunrpc/svc_xprt.c | 5 ++++- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 2604c3e70ea5..f43ecd61fa81 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1002,12 +1002,8 @@ static ssize_t __write_ports_addxprt(char *buf) err = svc_create_xprt(nfsd_serv, transport, PF_INET, port, SVC_SOCK_ANONYMOUS); - if (err < 0) { - /* Give a reasonable perror msg for bad transport string */ - if (err == -ENOENT) - err = -EPROTONOSUPPORT; + if (err < 0) return err; - } return 0; } diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index f886ff367c80..d7ec5caf998c 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -235,7 +235,10 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, err: spin_unlock(&svc_xprt_class_lock); dprintk("svc: transport %s not found\n", xprt_name); - return -ENOENT; + + /* This errno is exposed to user space. Provide a reasonable + * perror msg for a bad transport. */ + return -EPROTONOSUPPORT; } EXPORT_SYMBOL_GPL(svc_create_xprt); -- cgit v1.2.3 From 37498292aa97658a5d0a9bb84699ce8c1016bb74 Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Tue, 26 Jan 2010 14:04:22 -0500 Subject: NFSD: Create PF_INET6 listener in write_ports Try to create a PF_INET6 listener for NFSD, if IPv6 is enabled in the kernel. Make sure nfsd_serv's reference count is decreased if __write_ports_addxprt() failed to create a listener. See __write_ports_addfd(). Our current plan is to rely on rpc.nfsd to create appropriate IPv6 listeners when server-side NFS/IPv6 support is desired. Legacy behavior, via the write_threads or write_svc kernel APIs, will remain the same -- only IPv4 listeners are created. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> [bfields@citi.umich.edu: Move error-handling code to end] Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu> --- fs/nfsd/nfsctl.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index f43ecd61fa81..0f0e77f2012f 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -988,6 +988,7 @@ static ssize_t __write_ports_delfd(char *buf) static ssize_t __write_ports_addxprt(char *buf) { char transport[16]; + struct svc_xprt *xprt; int port, err; if (sscanf(buf, "%15s %4u", transport, &port) != 2) @@ -1003,8 +1004,23 @@ static ssize_t __write_ports_addxprt(char *buf) err = svc_create_xprt(nfsd_serv, transport, PF_INET, port, SVC_SOCK_ANONYMOUS); if (err < 0) - return err; + goto out_err; + + err = svc_create_xprt(nfsd_serv, transport, + PF_INET6, port, SVC_SOCK_ANONYMOUS); + if (err < 0 && err != -EAFNOSUPPORT) + goto out_close; return 0; +out_close: + xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port); + if (xprt != NULL) { + svc_close_xprt(xprt); + svc_xprt_put(xprt); + } +out_err: + /* Decrease the count, but don't shut down the service */ + nfsd_serv->sv_nrthreads--; + return err; } /* -- cgit v1.2.3 From aa696a6f349638428982bb52763f4cda851632fa Mon Sep 17 00:00:00 2001 From: Trond Myklebust <Trond.Myklebust@netapp.com> Date: Fri, 29 Jan 2010 16:44:25 -0500 Subject: nfsd: Use vfs_fsync_range() in nfsd_commit The NFS COMMIT operation allows the client to specify the exact byte range that it wishes to sync to disk in order to optimise server performance. Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu> --- fs/nfsd/vfs.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 79d216f276d9..ed024d329056 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1141,8 +1141,9 @@ out: #ifdef CONFIG_NFSD_V3 /* * Commit all pending writes to stable storage. - * Strictly speaking, we could sync just the indicated file region here, - * but there's currently no way we can ask the VFS to do so. + * + * Note: we only guarantee that data that lies within the range specified + * by the 'offset' and 'count' parameters will be synced. * * Unfortunately we cannot lock the file to make sure we return full WCC * data to the client, as locking happens lower down in the filesystem. @@ -1152,23 +1153,32 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, unsigned long count) { struct file *file; - __be32 err; + loff_t end = LLONG_MAX; + __be32 err = nfserr_inval; - if ((u64)count > ~(u64)offset) - return nfserr_inval; + if (offset < 0) + goto out; + if (count != 0) { + end = offset + (loff_t)count - 1; + if (end < offset) + goto out; + } err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); if (err) - return err; + goto out; if (EX_ISSYNC(fhp->fh_export)) { - if (file->f_op && file->f_op->fsync) { - err = nfserrno(vfs_fsync(file, file->f_path.dentry, 0)); - } else { + int err2 = vfs_fsync_range(file, file->f_path.dentry, + offset, end, 0); + + if (err2 != -EINVAL) + err = nfserrno(err2); + else err = nfserr_notsupp; - } } nfsd_close(file); +out: return err; } #endif /* CONFIG_NFSD_V3 */ -- cgit v1.2.3 From 20026d92013d7bb3abb295337191def6758fc086 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Thu, 4 Feb 2010 09:48:58 +1100 Subject: xfs: kill the unused XFS_QMOPT_* flush flags V2 dquots are never flushed asynchronously. Remove the flag and the async write support from the flush function. Make the default flush a delwri flush to make the inode flush code, which leaves the XFS_QMOPT_SYNC the only flag remaining. Convert that to use SYNC_WAIT instead, just like the inode flush code. V2: - just pass flush flags straight through Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/quota/xfs_dquot.c | 13 ++++++------- fs/xfs/quota/xfs_dquot_item.c | 2 +- fs/xfs/quota/xfs_qm.c | 14 ++++++-------- fs/xfs/xfs_quota.h | 8 +------- 4 files changed, 14 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 1620a56b067e..5f79dd78626b 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -1187,7 +1187,7 @@ xfs_qm_dqflush( * block, nada. */ if (!XFS_DQ_IS_DIRTY(dqp) || - (!(flags & XFS_QMOPT_SYNC) && atomic_read(&dqp->q_pincount) > 0)) { + (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) { xfs_dqfunlock(dqp); return 0; } @@ -1251,18 +1251,17 @@ xfs_qm_dqflush( xfs_log_force(mp, 0); } - if (flags & XFS_QMOPT_DELWRI) { - xfs_bdwrite(mp, bp); - } else { + if (flags & SYNC_WAIT) error = xfs_bwrite(mp, bp); - } + else + xfs_bdwrite(mp, bp); trace_xfs_dqflush_done(dqp); /* * dqp is still locked, but caller is free to unlock it now. */ - return (error); + return error; } @@ -1443,7 +1442,7 @@ xfs_qm_dqpurge( * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ - error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC); + error = xfs_qm_dqflush(dqp, SYNC_WAIT); if (error) xfs_fs_cmn_err(CE_WARN, mp, "xfs_qm_dqpurge: dquot %p flush failed", dqp); diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index dda0fb045c8a..4e4ee9a57194 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -153,7 +153,7 @@ xfs_qm_dquot_logitem_push( * lock without sleeping, then there must not have been * anyone in the process of flushing the dquot. */ - error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); + error = xfs_qm_dqflush(dqp, 0); if (error) xfs_fs_cmn_err(CE_WARN, dqp->q_mount, "xfs_qm_dquot_logitem_push: push error %d on dqp %p", diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 11cfd8245c7c..8699e51cb45e 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -450,7 +450,7 @@ xfs_qm_unmount_quotas( STATIC int xfs_qm_dqflush_all( xfs_mount_t *mp, - int flags) + int sync_mode) { int recl; xfs_dquot_t *dqp; @@ -486,7 +486,7 @@ again: * across a disk write. */ xfs_qm_mplist_unlock(mp); - error = xfs_qm_dqflush(dqp, flags); + error = xfs_qm_dqflush(dqp, sync_mode); xfs_dqunlock(dqp); if (error) return error; @@ -926,13 +926,11 @@ xfs_qm_sync( { int recl, restarts; xfs_dquot_t *dqp; - uint flush_flags; int error; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; - flush_flags = (flags & SYNC_WAIT) ? XFS_QMOPT_SYNC : XFS_QMOPT_DELWRI; restarts = 0; again: @@ -992,7 +990,7 @@ xfs_qm_sync( * across a disk write */ xfs_qm_mplist_unlock(mp); - error = xfs_qm_dqflush(dqp, flush_flags); + error = xfs_qm_dqflush(dqp, flags); xfs_dqunlock(dqp); if (error && XFS_FORCED_SHUTDOWN(mp)) return 0; /* Need to prevent umount failure */ @@ -1796,7 +1794,7 @@ xfs_qm_quotacheck( * successfully. */ if (!error) - error = xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI); + error = xfs_qm_dqflush_all(mp, 0); /* * We can get this error if we couldn't do a dquot allocation inside @@ -2018,7 +2016,7 @@ xfs_qm_shake_freelist( * We flush it delayed write, so don't bother * releasing the mplock. */ - error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); + error = xfs_qm_dqflush(dqp, 0); if (error) { xfs_fs_cmn_err(CE_WARN, dqp->q_mount, "xfs_qm_dqflush_all: dquot %p flush failed", dqp); @@ -2201,7 +2199,7 @@ xfs_qm_dqreclaim_one(void) * We flush it delayed write, so don't bother * releasing the freelist lock. */ - error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); + error = xfs_qm_dqflush(dqp, 0); if (error) { xfs_fs_cmn_err(CE_WARN, dqp->q_mount, "xfs_qm_dqreclaim: dquot %p flush failed", dqp); diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 21d11d9f48f2..fdcab3f81dde 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -222,16 +222,10 @@ typedef struct xfs_qoff_logformat { #define XFS_QMOPT_DELRTBCOUNT 0x0400000 #define XFS_QMOPT_RES_INOS 0x0800000 -/* - * flags for dqflush and dqflush_all. - */ -#define XFS_QMOPT_SYNC 0x1000000 -#define XFS_QMOPT_DELWRI 0x4000000 - /* * flags for dqalloc. */ -#define XFS_QMOPT_INHERIT 0x8000000 +#define XFS_QMOPT_INHERIT 0x1000000 /* * flags to xfs_trans_mod_dquot. -- cgit v1.2.3 From e8b217e7530c6a073ac69f1c85b922d93fdf5647 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Tue, 2 Feb 2010 10:16:26 +1100 Subject: xfs: remove invalid barrier optimization from xfs_fsync We always need to flush the disk write cache and can't skip it just because the no inode attributes have changed. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> --- fs/xfs/xfs_vnodeops.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index fd108b738559..43241e289800 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -597,7 +597,7 @@ xfs_fsync( { xfs_trans_t *tp; int error = 0; - int log_flushed = 0, changed = 1; + int log_flushed = 0; xfs_itrace_entry(ip); @@ -627,18 +627,10 @@ xfs_fsync( * disk yet, the inode will be still be pinned. If it is, * force the log. */ - xfs_iunlock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) { error = _xfs_log_force(ip->i_mount, XFS_LOG_SYNC, &log_flushed); - } else { - /* - * If the inode is not pinned and nothing has changed - * we don't need to flush the cache. - */ - changed = 0; } } else { /* @@ -673,7 +665,7 @@ xfs_fsync( xfs_iunlock(ip, XFS_ILOCK_EXCL); } - if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) { + if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) { /* * If the log write didn't issue an ordered tag we need * to flush the disk cache for the data device now. -- cgit v1.2.3 From 07fec73625dc0db6f9aed68019918208a2ca53f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Tue, 9 Feb 2010 11:43:49 +1100 Subject: xfs: log changed inodes instead of writing them synchronously When an inode has already be flushed delayed write, xfs_inode_clean() returns true and hence xfs_fs_write_inode() can return on a synchronous inode write without having written the inode. Currently these sycnhronous writes only come sync(1), unmount, a sycnhronous NFS export and cachefiles so should be relatively rare and out of common performance paths. Realistically, a synchronous inode write is not necessary here; we can avoid writing the inode by logging any non-transactional changes that are pending. This needs to be done with synchronous transactions, but it avoids seeking between the log and inode clusters as we do now. We don't force the log if the inode is pinned, though, so this differs from the fsync case. For normal sys_sync and unmount behaviour this is fine because we do a synchronous log force in xfs_sync_data which is called from the ->sync_fs code. It does however break the NFS synchronous export guarantees for now, but work is under way to fix this at a higher level or for the higher level to provide an additional flag in the writeback control to tell us that a log force is needed. Portions of this patch are based on work from Dave Chinner. Signed-off-by: Christoph Hellwig <hch@infradead.org> Reviewed-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_super.c | 111 ++++++++++++++++++++++++++++++++----------- 1 file changed, 82 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 3b5b46b8e3b9..25ea2408118f 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1021,12 +1021,45 @@ xfs_fs_dirty_inode( XFS_I(inode)->i_update_core = 1; } -/* - * Attempt to flush the inode, this will actually fail - * if the inode is pinned, but we dirty the inode again - * at the point when it is unpinned after a log write, - * since this is when the inode itself becomes flushable. - */ +STATIC int +xfs_log_inode( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + xfs_iunlock(ip, XFS_ILOCK_SHARED); + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + + if (error) { + xfs_trans_cancel(tp, 0); + /* we need to return with the lock hold shared */ + xfs_ilock(ip, XFS_ILOCK_SHARED); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * Note - it's possible that we might have pushed ourselves out of the + * way during trans_reserve which would flush the inode. But there's + * no guarantee that the inode buffer has actually gone out yet (it's + * delwri). Plus the buffer could be pinned anyway if it's part of + * an inode in another recent transaction. So we play it safe and + * fire off the transaction anyway. + */ + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + xfs_ilock_demote(ip, XFS_ILOCK_EXCL); + + return error; +} + STATIC int xfs_fs_write_inode( struct inode *inode, @@ -1034,7 +1067,7 @@ xfs_fs_write_inode( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - int error = 0; + int error = EAGAIN; xfs_itrace_entry(ip); @@ -1045,35 +1078,55 @@ xfs_fs_write_inode( error = xfs_wait_on_pages(ip, 0, -1); if (error) goto out; - } - - /* - * Bypass inodes which have already been cleaned by - * the inode flush clustering code inside xfs_iflush - */ - if (xfs_inode_clean(ip)) - goto out; - /* - * We make this non-blocking if the inode is contended, return - * EAGAIN to indicate to the caller that they did not succeed. - * This prevents the flush path from blocking on inodes inside - * another operation right now, they get caught later by xfs_sync. - */ - if (sync) { + /* + * Make sure the inode has hit stable storage. By using the + * log and the fsync transactions we reduce the IOs we have + * to do here from two (log and inode) to just the log. + * + * Note: We still need to do a delwri write of the inode after + * this to flush it to the backing buffer so that bulkstat + * works properly if this is the first time the inode has been + * written. Because we hold the ilock atomically over the + * transaction commit and the inode flush we are guaranteed + * that the inode is not pinned when it returns. If the flush + * lock is already held, then the inode has already been + * flushed once and we don't need to flush it again. Hence + * the code will only flush the inode if it isn't already + * being flushed. + */ xfs_ilock(ip, XFS_ILOCK_SHARED); - xfs_iflock(ip); - - error = xfs_iflush(ip, SYNC_WAIT); + if (ip->i_update_core) { + error = xfs_log_inode(ip); + if (error) + goto out_unlock; + } } else { - error = EAGAIN; + /* + * We make this non-blocking if the inode is contended, return + * EAGAIN to indicate to the caller that they did not succeed. + * This prevents the flush path from blocking on inodes inside + * another operation right now, they get caught later by xfs_sync. + */ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) goto out; - if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) - goto out_unlock; + } + + if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) + goto out_unlock; - error = xfs_iflush(ip, 0); + /* + * Now we have the flush lock and the inode is not pinned, we can check + * if the inode is really clean as we know that there are no pending + * transaction completions, it is not waiting on the delayed write + * queue and there is no IO in progress. + */ + if (xfs_inode_clean(ip)) { + xfs_ifunlock(ip); + error = 0; + goto out_unlock; } + error = xfs_iflush(ip, 0); out_unlock: xfs_iunlock(ip, XFS_ILOCK_SHARED); -- cgit v1.2.3 From 5322892d867e186c6b4c5fff5c99ea4863696a60 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Thu, 4 Feb 2010 10:09:14 +1100 Subject: xfs: kill xfs_bawrite There are no more users of this function left in the XFS code now that we've switched everything to delayed write flushing. Remove it. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/linux-2.6/xfs_buf.c | 19 ------------------- fs/xfs/linux-2.6/xfs_buf.h | 1 - 2 files changed, 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 4556a4c31e36..d50df3a8101c 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1078,25 +1078,6 @@ xfs_bwrite( return error; } -int -xfs_bawrite( - void *mp, - struct xfs_buf *bp) -{ - trace_xfs_buf_bawrite(bp, _RET_IP_); - - ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); - - xfs_buf_delwri_dequeue(bp); - - bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD); - bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES); - - bp->b_mount = mp; - bp->b_strat = xfs_bdstrat_cb; - return xfs_bdstrat_cb(bp); -} - void xfs_bdwrite( void *mp, diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index be45e8c5768d..386e7361e50e 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -233,7 +233,6 @@ extern void xfs_buf_unlock(xfs_buf_t *); /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); -extern int xfs_bawrite(void *mp, xfs_buf_t *bp); extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); -- cgit v1.2.3 From 002345925e6c45861f60db6f4fc6236713fd8847 Mon Sep 17 00:00:00 2001 From: Kees Cook <kees.cook@canonical.com> Date: Wed, 3 Feb 2010 15:36:43 -0800 Subject: syslog: distinguish between /proc/kmsg and syscalls This allows the LSM to distinguish between syslog functions originating from /proc/kmsg access and direct syscalls. By default, the commoncaps will now no longer require CAP_SYS_ADMIN to read an opened /proc/kmsg file descriptor. For example the kernel syslog reader can now drop privileges after opening /proc/kmsg, instead of staying privileged with CAP_SYS_ADMIN. MAC systems that implement security_syslog have unchanged behavior. Signed-off-by: Kees Cook <kees.cook@canonical.com> Acked-by: Serge Hallyn <serue@us.ibm.com> Acked-by: John Johansen <john.johansen@canonical.com> Signed-off-by: James Morris <jmorris@namei.org> --- fs/proc/kmsg.c | 14 +++++++------- include/linux/security.h | 11 ++++++----- include/linux/syslog.h | 29 +++++++++++++++++++++++++++++ kernel/printk.c | 7 ++++--- security/commoncap.c | 7 ++++++- security/security.c | 4 ++-- security/selinux/hooks.c | 5 +++-- security/smack/smack_lsm.c | 4 ++-- 8 files changed, 59 insertions(+), 22 deletions(-) create mode 100644 include/linux/syslog.h (limited to 'fs') diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 7ca78346d3f0..6a3d843a1088 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -12,37 +12,37 @@ #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/fs.h> +#include <linux/syslog.h> #include <asm/uaccess.h> #include <asm/io.h> extern wait_queue_head_t log_wait; -extern int do_syslog(int type, char __user *bug, int count); - static int kmsg_open(struct inode * inode, struct file * file) { - return do_syslog(1,NULL,0); + return do_syslog(1, NULL, 0, SYSLOG_FROM_FILE); } static int kmsg_release(struct inode * inode, struct file * file) { - (void) do_syslog(0,NULL,0); + (void) do_syslog(0, NULL, 0, SYSLOG_FROM_FILE); return 0; } static ssize_t kmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - if ((file->f_flags & O_NONBLOCK) && !do_syslog(9, NULL, 0)) + if ((file->f_flags & O_NONBLOCK) && + !do_syslog(9, NULL, 0, SYSLOG_FROM_FILE)) return -EAGAIN; - return do_syslog(2, buf, count); + return do_syslog(2, buf, count, SYSLOG_FROM_FILE); } static unsigned int kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); - if (do_syslog(9, NULL, 0)) + if (do_syslog(9, NULL, 0, SYSLOG_FROM_FILE)) return POLLIN | POLLRDNORM; return 0; } diff --git a/include/linux/security.h b/include/linux/security.h index 26eca85b2417..a4dc74d86ac6 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -76,7 +76,7 @@ extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, extern int cap_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp); extern int cap_task_setioprio(struct task_struct *p, int ioprio); extern int cap_task_setnice(struct task_struct *p, int nice); -extern int cap_syslog(int type); +extern int cap_syslog(int type, bool from_file); extern int cap_vm_enough_memory(struct mm_struct *mm, long pages); struct msghdr; @@ -1349,6 +1349,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * logging to the console. * See the syslog(2) manual page for an explanation of the @type values. * @type contains the type of action. + * @from_file indicates the context of action (if it came from /proc). * Return 0 if permission is granted. * @settime: * Check permission to change the system time. @@ -1463,7 +1464,7 @@ struct security_operations { int (*sysctl) (struct ctl_table *table, int op); int (*quotactl) (int cmds, int type, int id, struct super_block *sb); int (*quota_on) (struct dentry *dentry); - int (*syslog) (int type); + int (*syslog) (int type, bool from_file); int (*settime) (struct timespec *ts, struct timezone *tz); int (*vm_enough_memory) (struct mm_struct *mm, long pages); @@ -1762,7 +1763,7 @@ int security_acct(struct file *file); int security_sysctl(struct ctl_table *table, int op); int security_quotactl(int cmds, int type, int id, struct super_block *sb); int security_quota_on(struct dentry *dentry); -int security_syslog(int type); +int security_syslog(int type, bool from_file); int security_settime(struct timespec *ts, struct timezone *tz); int security_vm_enough_memory(long pages); int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); @@ -2008,9 +2009,9 @@ static inline int security_quota_on(struct dentry *dentry) return 0; } -static inline int security_syslog(int type) +static inline int security_syslog(int type, bool from_file) { - return cap_syslog(type); + return cap_syslog(type, from_file); } static inline int security_settime(struct timespec *ts, struct timezone *tz) diff --git a/include/linux/syslog.h b/include/linux/syslog.h new file mode 100644 index 000000000000..5f02b1817be1 --- /dev/null +++ b/include/linux/syslog.h @@ -0,0 +1,29 @@ +/* Syslog internals + * + * Copyright 2010 Canonical, Ltd. + * Author: Kees Cook <kees.cook@canonical.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _LINUX_SYSLOG_H +#define _LINUX_SYSLOG_H + +#define SYSLOG_FROM_CALL 0 +#define SYSLOG_FROM_FILE 1 + +int do_syslog(int type, char __user *buf, int count, bool from_file); + +#endif /* _LINUX_SYSLOG_H */ diff --git a/kernel/printk.c b/kernel/printk.c index 17463ca2e229..809cf9a258a0 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -35,6 +35,7 @@ #include <linux/kexec.h> #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> +#include <linux/syslog.h> #include <asm/uaccess.h> @@ -273,14 +274,14 @@ static inline void boot_delay_msec(void) * 9 -- Return number of unread characters in the log buffer * 10 -- Return size of the log buffer */ -int do_syslog(int type, char __user *buf, int len) +int do_syslog(int type, char __user *buf, int len, bool from_file) { unsigned i, j, limit, count; int do_clear = 0; char c; int error = 0; - error = security_syslog(type); + error = security_syslog(type, from_file); if (error) return error; @@ -417,7 +418,7 @@ out: SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { - return do_syslog(type, buf, len); + return do_syslog(type, buf, len, SYSLOG_FROM_CALL); } /* diff --git a/security/commoncap.c b/security/commoncap.c index f800fdb3de94..677fad9d5cba 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -27,6 +27,7 @@ #include <linux/sched.h> #include <linux/prctl.h> #include <linux/securebits.h> +#include <linux/syslog.h> /* * If a non-root user executes a setuid-root binary in @@ -888,12 +889,16 @@ error: /** * cap_syslog - Determine whether syslog function is permitted * @type: Function requested + * @from_file: Whether this request came from an open file (i.e. /proc) * * Determine whether the current process is permitted to use a particular * syslog function, returning 0 if permission is granted, -ve if not. */ -int cap_syslog(int type) +int cap_syslog(int type, bool from_file) { + /* /proc/kmsg can open be opened by CAP_SYS_ADMIN */ + if (type != 1 && from_file) + return 0; if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN)) return -EPERM; return 0; diff --git a/security/security.c b/security/security.c index 440afe5eb54c..971092c06f31 100644 --- a/security/security.c +++ b/security/security.c @@ -203,9 +203,9 @@ int security_quota_on(struct dentry *dentry) return security_ops->quota_on(dentry); } -int security_syslog(int type) +int security_syslog(int type, bool from_file) { - return security_ops->syslog(type); + return security_ops->syslog(type, from_file); } int security_settime(struct timespec *ts, struct timezone *tz) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 9a2ee845e9d4..a4862a0730fa 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -76,6 +76,7 @@ #include <linux/selinux.h> #include <linux/mutex.h> #include <linux/posix-timers.h> +#include <linux/syslog.h> #include "avc.h" #include "objsec.h" @@ -2049,11 +2050,11 @@ static int selinux_quota_on(struct dentry *dentry) return dentry_has_perm(cred, NULL, dentry, FILE__QUOTAON); } -static int selinux_syslog(int type) +static int selinux_syslog(int type, bool from_file) { int rc; - rc = cap_syslog(type); + rc = cap_syslog(type, from_file); if (rc) return rc; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 529c9ca65878..a5721b373f53 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -157,12 +157,12 @@ static int smack_ptrace_traceme(struct task_struct *ptp) * * Returns 0 on success, error code otherwise. */ -static int smack_syslog(int type) +static int smack_syslog(int type, bool from_file) { int rc; char *sp = current_security(); - rc = cap_syslog(type); + rc = cap_syslog(type, from_file); if (rc != 0) return rc; -- cgit v1.2.3 From d78ca3cd733d8a2c3dcd88471beb1a15d973eed8 Mon Sep 17 00:00:00 2001 From: Kees Cook <kees.cook@canonical.com> Date: Wed, 3 Feb 2010 15:37:13 -0800 Subject: syslog: use defined constants instead of raw numbers Right now the syslog "type" action are just raw numbers which makes the source difficult to follow. This patch replaces the raw numbers with defined constants for some level of sanity. Signed-off-by: Kees Cook <kees.cook@canonical.com> Acked-by: John Johansen <john.johansen@canonical.com> Acked-by: Serge Hallyn <serue@us.ibm.com> Signed-off-by: James Morris <jmorris@namei.org> --- fs/proc/kmsg.c | 10 +++++----- include/linux/syslog.h | 23 +++++++++++++++++++++++ kernel/printk.c | 45 +++++++++++++++++++-------------------------- security/commoncap.c | 5 +++-- security/selinux/hooks.c | 21 +++++++++++---------- 5 files changed, 61 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 6a3d843a1088..cfe90a48a6e8 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -21,12 +21,12 @@ extern wait_queue_head_t log_wait; static int kmsg_open(struct inode * inode, struct file * file) { - return do_syslog(1, NULL, 0, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE); } static int kmsg_release(struct inode * inode, struct file * file) { - (void) do_syslog(0, NULL, 0, SYSLOG_FROM_FILE); + (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE); return 0; } @@ -34,15 +34,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { if ((file->f_flags & O_NONBLOCK) && - !do_syslog(9, NULL, 0, SYSLOG_FROM_FILE)) + !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) return -EAGAIN; - return do_syslog(2, buf, count, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE); } static unsigned int kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); - if (do_syslog(9, NULL, 0, SYSLOG_FROM_FILE)) + if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) return POLLIN | POLLRDNORM; return 0; } diff --git a/include/linux/syslog.h b/include/linux/syslog.h index 5f02b1817be1..38911391a139 100644 --- a/include/linux/syslog.h +++ b/include/linux/syslog.h @@ -21,6 +21,29 @@ #ifndef _LINUX_SYSLOG_H #define _LINUX_SYSLOG_H +/* Close the log. Currently a NOP. */ +#define SYSLOG_ACTION_CLOSE 0 +/* Open the log. Currently a NOP. */ +#define SYSLOG_ACTION_OPEN 1 +/* Read from the log. */ +#define SYSLOG_ACTION_READ 2 +/* Read all messages remaining in the ring buffer. */ +#define SYSLOG_ACTION_READ_ALL 3 +/* Read and clear all messages remaining in the ring buffer */ +#define SYSLOG_ACTION_READ_CLEAR 4 +/* Clear ring buffer. */ +#define SYSLOG_ACTION_CLEAR 5 +/* Disable printk's to console */ +#define SYSLOG_ACTION_CONSOLE_OFF 6 +/* Enable printk's to console */ +#define SYSLOG_ACTION_CONSOLE_ON 7 +/* Set level of messages printed to console */ +#define SYSLOG_ACTION_CONSOLE_LEVEL 8 +/* Return number of unread characters in the log buffer */ +#define SYSLOG_ACTION_SIZE_UNREAD 9 +/* Return size of the log buffer */ +#define SYSLOG_ACTION_SIZE_BUFFER 10 + #define SYSLOG_FROM_CALL 0 #define SYSLOG_FROM_FILE 1 diff --git a/kernel/printk.c b/kernel/printk.c index 809cf9a258a0..3e162d867098 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -259,21 +259,6 @@ static inline void boot_delay_msec(void) } #endif -/* - * Commands to do_syslog: - * - * 0 -- Close the log. Currently a NOP. - * 1 -- Open the log. Currently a NOP. - * 2 -- Read from the log. - * 3 -- Read all messages remaining in the ring buffer. - * 4 -- Read and clear all messages remaining in the ring buffer - * 5 -- Clear ring buffer. - * 6 -- Disable printk's to console - * 7 -- Enable printk's to console - * 8 -- Set level of messages printed to console - * 9 -- Return number of unread characters in the log buffer - * 10 -- Return size of the log buffer - */ int do_syslog(int type, char __user *buf, int len, bool from_file) { unsigned i, j, limit, count; @@ -286,11 +271,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) return error; switch (type) { - case 0: /* Close log */ + case SYSLOG_ACTION_CLOSE: /* Close log */ break; - case 1: /* Open log */ + case SYSLOG_ACTION_OPEN: /* Open log */ break; - case 2: /* Read from log */ + case SYSLOG_ACTION_READ: /* Read from log */ error = -EINVAL; if (!buf || len < 0) goto out; @@ -321,10 +306,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) if (!error) error = i; break; - case 4: /* Read/clear last kernel messages */ + /* Read/clear last kernel messages */ + case SYSLOG_ACTION_READ_CLEAR: do_clear = 1; /* FALL THRU */ - case 3: /* Read last kernel messages */ + /* Read last kernel messages */ + case SYSLOG_ACTION_READ_ALL: error = -EINVAL; if (!buf || len < 0) goto out; @@ -377,21 +364,25 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) } } break; - case 5: /* Clear ring buffer */ + /* Clear ring buffer */ + case SYSLOG_ACTION_CLEAR: logged_chars = 0; break; - case 6: /* Disable logging to console */ + /* Disable logging to console */ + case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == -1) saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; - case 7: /* Enable logging to console */ + /* Enable logging to console */ + case SYSLOG_ACTION_CONSOLE_ON: if (saved_console_loglevel != -1) { console_loglevel = saved_console_loglevel; saved_console_loglevel = -1; } break; - case 8: /* Set level of messages printed to console */ + /* Set level of messages printed to console */ + case SYSLOG_ACTION_CONSOLE_LEVEL: error = -EINVAL; if (len < 1 || len > 8) goto out; @@ -402,10 +393,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) saved_console_loglevel = -1; error = 0; break; - case 9: /* Number of chars in the log buffer */ + /* Number of chars in the log buffer */ + case SYSLOG_ACTION_SIZE_UNREAD: error = log_end - log_start; break; - case 10: /* Size of the log buffer */ + /* Size of the log buffer */ + case SYSLOG_ACTION_SIZE_BUFFER: error = log_buf_len; break; default: diff --git a/security/commoncap.c b/security/commoncap.c index 677fad9d5cba..cf01b2eebb60 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -897,9 +897,10 @@ error: int cap_syslog(int type, bool from_file) { /* /proc/kmsg can open be opened by CAP_SYS_ADMIN */ - if (type != 1 && from_file) + if (type != SYSLOG_ACTION_OPEN && from_file) return 0; - if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN)) + if ((type != SYSLOG_ACTION_READ_ALL && + type != SYSLOG_ACTION_SIZE_BUFFER) && !capable(CAP_SYS_ADMIN)) return -EPERM; return 0; } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index a4862a0730fa..6b36ce2eef2e 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2059,20 +2059,21 @@ static int selinux_syslog(int type, bool from_file) return rc; switch (type) { - case 3: /* Read last kernel messages */ - case 10: /* Return size of the log buffer */ + case SYSLOG_ACTION_READ_ALL: /* Read last kernel messages */ + case SYSLOG_ACTION_SIZE_BUFFER: /* Return size of the log buffer */ rc = task_has_system(current, SYSTEM__SYSLOG_READ); break; - case 6: /* Disable logging to console */ - case 7: /* Enable logging to console */ - case 8: /* Set level of messages printed to console */ + case SYSLOG_ACTION_CONSOLE_OFF: /* Disable logging to console */ + case SYSLOG_ACTION_CONSOLE_ON: /* Enable logging to console */ + /* Set level of messages printed to console */ + case SYSLOG_ACTION_CONSOLE_LEVEL: rc = task_has_system(current, SYSTEM__SYSLOG_CONSOLE); break; - case 0: /* Close log */ - case 1: /* Open log */ - case 2: /* Read from log */ - case 4: /* Read/clear last kernel messages */ - case 5: /* Clear ring buffer */ + case SYSLOG_ACTION_CLOSE: /* Close log */ + case SYSLOG_ACTION_OPEN: /* Open log */ + case SYSLOG_ACTION_READ: /* Read from log */ + case SYSLOG_ACTION_READ_CLEAR: /* Read/clear last kernel messages */ + case SYSLOG_ACTION_CLEAR: /* Clear ring buffer */ default: rc = task_has_system(current, SYSTEM__SYSLOG_MOD); break; -- cgit v1.2.3 From b21dda438baa450a76a375a35653ae0377793fab Mon Sep 17 00:00:00 2001 From: Miklos Szeredi <mszeredi@suse.cz> Date: Fri, 5 Feb 2010 12:08:31 +0100 Subject: fuse: cleanup in fuse_notify_inval_...() Small cleanup in fuse_notify_inval_inode() and fuse_notify_inval_entry(). Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> --- fs/fuse/dev.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 51d9e33d634f..ab622305c2f5 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -865,13 +865,10 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = -ENOENT; - if (!fc->sb) - goto err_unlock; - - err = fuse_reverse_inval_inode(fc->sb, outarg.ino, - outarg.off, outarg.len); - -err_unlock: + if (fc->sb) { + err = fuse_reverse_inval_inode(fc->sb, outarg.ino, + outarg.off, outarg.len); + } up_read(&fc->killsb); return err; @@ -910,12 +907,8 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = -ENOENT; - if (!fc->sb) - goto err_unlock; - - err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name); - -err_unlock: + if (fc->sb) + err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name); up_read(&fc->killsb); return err; -- cgit v1.2.3 From b2d82ee3c8b2193ee5bc7eca4687ee9be30abd34 Mon Sep 17 00:00:00 2001 From: Fang Wenqi <anton.fang@gmail.com> Date: Wed, 30 Dec 2009 18:37:13 +0800 Subject: fuse: fix large stack use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc 4.4 warns about: fs/fuse/dev.c: In function ‘fuse_notify_inval_entry’: fs/fuse/dev.c:925: warning: the frame size of 1060 bytes is larger than 1024 bytes The problem is we declare two structures and a large array on the stack, I move the array alway from the stack and allocate memory for it dynamically. Signed-off-by: Fang Wenqi <antonf@turbolinux.com.cn> Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> --- fs/fuse/dev.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ab622305c2f5..eb7e9423691f 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -881,10 +881,15 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_inval_entry_out outarg; - int err = -EINVAL; - char buf[FUSE_NAME_MAX+1]; + int err = -ENOMEM; + char *buf; struct qstr name; + buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); + if (!buf) + goto err; + + err = -EINVAL; if (size < sizeof(outarg)) goto err; @@ -910,9 +915,11 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, if (fc->sb) err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name); up_read(&fc->killsb); + kfree(buf); return err; err: + kfree(buf); fuse_copy_finish(cs); return err; } -- cgit v1.2.3 From 73c77e2ccc14413c232c3e0b3aa43a0c4b72ec70 Mon Sep 17 00:00:00 2001 From: James Bottomley <James.Bottomley@suse.de> Date: Mon, 25 Jan 2010 11:42:24 -0600 Subject: xfs: fix xfs to work with Virtually Indexed architectures xfs_buf.c includes what is essentially a hand rolled version of blk_rq_map_kern(). In order to work properly with the vmalloc buffers that xfs uses, this hand rolled routine must also implement the flushing API for vmap/vmalloc areas. [style updates from hch@lst.de] Acked-by: Christoph Hellwig <hch@lst.de> Signed-off-by: James Bottomley <James.Bottomley@suse.de> --- fs/xfs/linux-2.6/xfs_buf.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 77b8be81c769..6f3ebb634b8b 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -76,6 +76,27 @@ struct workqueue_struct *xfsconvertd_workqueue; #define xfs_buf_deallocate(bp) \ kmem_zone_free(xfs_buf_zone, (bp)); +static inline int +xfs_buf_is_vmapped( + struct xfs_buf *bp) +{ + /* + * Return true if the buffer is vmapped. + * + * The XBF_MAPPED flag is set if the buffer should be mapped, but the + * code is clever enough to know it doesn't have to map a single page, + * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1. + */ + return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1; +} + +static inline int +xfs_buf_vmap_len( + struct xfs_buf *bp) +{ + return (bp->b_page_count * PAGE_SIZE) - bp->b_offset; +} + /* * Page Region interfaces. * @@ -314,7 +335,7 @@ xfs_buf_free( if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { uint i; - if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) + if (xfs_buf_is_vmapped(bp)) free_address(bp->b_addr - bp->b_offset); for (i = 0; i < bp->b_page_count; i++) { @@ -1107,6 +1128,9 @@ xfs_buf_bio_end_io( xfs_buf_ioerror(bp, -error); + if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) + invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); + do { struct page *page = bvec->bv_page; @@ -1216,6 +1240,10 @@ next_chunk: submit_io: if (likely(bio->bi_size)) { + if (xfs_buf_is_vmapped(bp)) { + flush_kernel_vmap_range(bp->b_addr, + xfs_buf_vmap_len(bp)); + } submit_bio(rw, bio); if (size) goto next_chunk; -- cgit v1.2.3 From 123df2944c436c80640c4281c5bc9c7950b18687 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Fri, 25 Dec 2009 04:57:57 -0500 Subject: Lose the new_name argument of fsnotify_move() it's always new_dentry->d_name.name Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/debugfs/inode.c | 2 +- fs/namei.c | 6 ++---- include/linux/fsnotify.h | 3 ++- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 274ac865bae8..049d6c36da09 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -496,7 +496,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, } d_move(old_dentry, dentry); fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name, - old_dentry->d_name.name, S_ISDIR(old_dentry->d_inode->i_mode), + S_ISDIR(old_dentry->d_inode->i_mode), NULL, old_dentry); fsnotify_oldname_free(old_name); unlock_rename(new_dir, old_dir); diff --git a/fs/namei.c b/fs/namei.c index d62fdc875f22..f69df876fac3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2661,11 +2661,9 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); else error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); - if (!error) { - const char *new_name = old_dentry->d_name.name; - fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir, + if (!error) + fsnotify_move(old_dir, new_dir, old_name, is_dir, new_dentry->d_inode, old_dentry); - } fsnotify_oldname_free(old_name); return error; diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 936f9aa8bb97..2d755c49c324 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -65,7 +65,7 @@ static inline void fsnotify_link_count(struct inode *inode) * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir */ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, - const char *old_name, const char *new_name, + const char *old_name, int isdir, struct inode *target, struct dentry *moved) { struct inode *source = moved->d_inode; @@ -73,6 +73,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, u32 fs_cookie = fsnotify_get_cookie(); __u32 old_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_FROM); __u32 new_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_TO); + const char *new_name = moved->d_name.name; if (old_dir == new_dir) old_dir_mask |= FS_DN_RENAME; -- cgit v1.2.3 From cccc6bba3f771ef29b33e4f79e70ebc3dba245b0 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Fri, 25 Dec 2009 05:07:33 -0500 Subject: Lose the first argument of audit_inode_child() it's always equal to ->d_name.name of the second argument Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 2 +- include/linux/audit.h | 11 +++++------ include/linux/fsnotify.h | 8 ++++---- kernel/auditsc.c | 7 ++----- 4 files changed, 12 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index f69df876fac3..865282f8e012 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1337,7 +1337,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir) return -ENOENT; BUG_ON(victim->d_parent->d_inode != dir); - audit_inode_child(victim->d_name.name, victim, dir); + audit_inode_child(victim, dir); error = inode_permission(dir, MAY_WRITE | MAY_EXEC); if (error) diff --git a/include/linux/audit.h b/include/linux/audit.h index 3c7a358241a7..f391d45c8aea 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -424,7 +424,7 @@ extern void audit_syscall_exit(int failed, long return_code); extern void __audit_getname(const char *name); extern void audit_putname(const char *name); extern void __audit_inode(const char *name, const struct dentry *dentry); -extern void __audit_inode_child(const char *dname, const struct dentry *dentry, +extern void __audit_inode_child(const struct dentry *dentry, const struct inode *parent); extern void __audit_ptrace(struct task_struct *t); @@ -442,11 +442,10 @@ static inline void audit_inode(const char *name, const struct dentry *dentry) { if (unlikely(!audit_dummy_context())) __audit_inode(name, dentry); } -static inline void audit_inode_child(const char *dname, - const struct dentry *dentry, +static inline void audit_inode_child(const struct dentry *dentry, const struct inode *parent) { if (unlikely(!audit_dummy_context())) - __audit_inode_child(dname, dentry, parent); + __audit_inode_child(dentry, parent); } void audit_core_dumps(long signr); @@ -544,9 +543,9 @@ extern int audit_signals; #define audit_getname(n) do { ; } while (0) #define audit_putname(n) do { ; } while (0) #define __audit_inode(n,d) do { ; } while (0) -#define __audit_inode_child(d,i,p) do { ; } while (0) +#define __audit_inode_child(i,p) do { ; } while (0) #define audit_inode(n,d) do { ; } while (0) -#define audit_inode_child(d,i,p) do { ; } while (0) +#define audit_inode_child(i,p) do { ; } while (0) #define audit_core_dumps(i) do { ; } while (0) #define auditsc_get_stamp(c,t,s) (0) #define audit_get_loginuid(t) (-1) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 2d755c49c324..df8fd9a3b214 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -104,7 +104,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL); fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0); } - audit_inode_child(new_name, moved, new_dir); + audit_inode_child(moved, new_dir); } /* @@ -147,7 +147,7 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry) { inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name, dentry->d_inode); - audit_inode_child(dentry->d_name.name, dentry, inode); + audit_inode_child(dentry, inode); fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0); } @@ -162,7 +162,7 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct inotify_inode_queue_event(dir, IN_CREATE, 0, new_dentry->d_name.name, inode); fsnotify_link_count(inode); - audit_inode_child(new_dentry->d_name.name, new_dentry, dir); + audit_inode_child(new_dentry, dir); fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE, new_dentry->d_name.name, 0); } @@ -176,7 +176,7 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) struct inode *d_inode = dentry->d_inode; inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode); - audit_inode_child(dentry->d_name.name, dentry, inode); + audit_inode_child(dentry, inode); fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fc0f928167e7..f3a461c0970a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1988,7 +1988,6 @@ void __audit_inode(const char *name, const struct dentry *dentry) /** * audit_inode_child - collect inode info for created/removed objects - * @dname: inode's dentry name * @dentry: dentry being audited * @parent: inode of dentry parent * @@ -2000,13 +1999,14 @@ void __audit_inode(const char *name, const struct dentry *dentry) * must be hooked prior, in order to capture the target inode during * unsuccessful attempts. */ -void __audit_inode_child(const char *dname, const struct dentry *dentry, +void __audit_inode_child(const struct dentry *dentry, const struct inode *parent) { int idx; struct audit_context *context = current->audit_context; const char *found_parent = NULL, *found_child = NULL; const struct inode *inode = dentry->d_inode; + const char *dname = dentry->d_name.name; int dirlen = 0; if (!context->in_syscall) @@ -2014,9 +2014,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry, if (inode) handle_one(inode); - /* determine matching parent */ - if (!dname) - goto add_names; /* parent is more likely, look for it first */ for (idx = 0; idx < context->name_count; idx++) { -- cgit v1.2.3 From cdd30fa1664e0245fa64330c7cc2ddab7e47c223 Mon Sep 17 00:00:00 2001 From: Jeff Layton <jlayton@redhat.com> Date: Fri, 5 Feb 2010 15:09:12 -0500 Subject: lockd: release reference to nsm_handle in nlm_host_rebooted nsm_reboot_lookup takes a reference to the nsm_handle that it returns, but nlm_host_rebooted never releases that reference. Signed-off-by: Jeff Layton <jlayton@redhat.com> Reviewed-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu> --- fs/lockd/host.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 4600c2037b8b..bb464d12104c 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -479,8 +479,8 @@ again: mutex_lock(&nlm_host_mutex); } } } - mutex_unlock(&nlm_host_mutex); + nsm_release(nsm); } /* -- cgit v1.2.3 From 7e469af97eed947ba9204712601281a69ae8eb6c Mon Sep 17 00:00:00 2001 From: Jeff Layton <jlayton@redhat.com> Date: Fri, 5 Feb 2010 15:09:22 -0500 Subject: lockd: don't clear sm_monitored on nsm_reboot_lookup When lockd gets a notify downcall from statd, it'll search its hosts cache and then clear the sm_monitored bit on the host it finds. The idea is apparently to make lockd redo a SM_MON on the next lock request. This is unnecessary and causes the kernel's NSM cache to go out of sync with statd. statd doesn't stop monitoring a host when it gets a SM_NOTIFY and there's no guarantee that another lock will occur after the reclaim and before the unmount. In that event, no SM_UNMON will occur. Signed-off-by: Jeff Layton <jlayton@redhat.com> Reviewed-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu> --- fs/lockd/mon.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index f956651d0f65..fefa4df3f005 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -349,9 +349,9 @@ retry: * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle * @info: pointer to NLMPROC_SM_NOTIFY arguments * - * Returns a matching nsm_handle if found in the nsm cache; the returned - * nsm_handle's reference count is bumped and sm_monitored is cleared. - * Otherwise returns NULL if some error occurred. + * Returns a matching nsm_handle if found in the nsm cache. The returned + * nsm_handle's reference count is bumped. Otherwise returns NULL if some + * error occurred. */ struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info) { @@ -370,12 +370,6 @@ struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info) atomic_inc(&cached->sm_count); spin_unlock(&nsm_lock); - /* - * During subsequent lock activity, force a fresh - * notification to be set up for this host. - */ - cached->sm_monitored = 0; - dprintk("lockd: host %s (%s) rebooted, cnt %d\n", cached->sm_name, cached->sm_addrbuf, atomic_read(&cached->sm_count)); -- cgit v1.2.3 From 8cfb3343f70bcf9403218df120ecf345f06dd585 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr <jeremy.kerr@canonical.com> Date: Mon, 1 Feb 2010 21:34:14 -0700 Subject: of: make set_node_proc_entry private to proc_devtree.c We only need set_node_proc_entry in proc_devtree.c, so move it there. This fixes the !HAVE_ARCH_DEVTREE_FIXUPS build, as we can't make make the definition in linux/of.h conditional on this #define (definitions in asm/prom.h can't be exposed to linux/of.h, due to the enforced #include ordering). Signed-off-by: Jeremy Kerr <jeremy.kerr@canonical.com> Signed-off-by: Grant Likely <grant.likely@secretlab.ca> --- fs/proc/proc_devtree.c | 5 +++-- include/linux/of.h | 6 ------ 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 123257bb356b..2309bf17203f 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -14,12 +14,13 @@ #include <asm/uaccess.h> #include "internal.h" -#ifndef HAVE_ARCH_DEVTREE_FIXUPS static inline void set_node_proc_entry(struct device_node *np, struct proc_dir_entry *de) { -} +#ifdef HAVE_ARCH_DEVTREE_FIXUPS + np->pde = de; #endif +} static struct proc_dir_entry *proc_device_tree; diff --git a/include/linux/of.h b/include/linux/of.h index 3cc0d7ae290e..fd47c81d7a25 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -73,12 +73,6 @@ static inline void of_node_set_flag(struct device_node *n, unsigned long flag) set_bit(flag, &n->_flags); } -static inline void -set_node_proc_entry(struct device_node *dn, struct proc_dir_entry *de) -{ - dn->pde = de; -} - extern struct device_node *of_find_all_nodes(struct device_node *prev); #if defined(CONFIG_SPARC) -- cgit v1.2.3 From 50ab2fe147e22c8786552cda1791a61ae81b84d2 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr <jeremy.kerr@canonical.com> Date: Mon, 1 Feb 2010 21:34:14 -0700 Subject: proc_devtree: include linux/of.h Currenly, proc_devtree.c depends on asm/prom.h to include linux/of.h, to provide some device-tree definitions (eg, struct property). Instead, include linux/of.h directly. We still need asm/prom.h for HAVE_ARCH_DEVTREE_FIXUPS. Signed-off-by: Jeremy Kerr <jeremy.kerr@canonical.com> Signed-off-by: Grant Likely <grant.likely@secretlab.ca> --- fs/proc/proc_devtree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 2309bf17203f..0ec45110e15e 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -10,6 +10,7 @@ #include <linux/seq_file.h> #include <linux/stat.h> #include <linux/string.h> +#include <linux/of.h> #include <asm/prom.h> #include <asm/uaccess.h> #include "internal.h" -- cgit v1.2.3 From 2c6434888cef9e5f450d6c5b7df6d8c625ed27c1 Mon Sep 17 00:00:00 2001 From: Jeff Layton <jlayton@redhat.com> Date: Thu, 7 Jan 2010 09:42:03 -0500 Subject: nfs4: handle -EKEYEXPIRED errors from RPC layer If a KRB5 TGT ticket expires, we don't want to return an error immediatel. If someone has a long running job and just forgets to run "kinit" in time then this will make it fail. Instead, we want to treat this situation as we would NFS4ERR_DELAY and retry the upcall after delaying a bit with an exponential backoff. This patch just makes any place that would handle NFS4ERR_DELAY also handle -EKEYEXPIRED the same way. In the future, we may want to be more sophisticated however and handle hard vs. soft mounts differently, or specify some upper limit on how long we'll wait for a new TGT to be acquired. Signed-off-by: Jeff Layton <jlayton@redhat.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/nfs4proc.c | 11 +++++++++-- fs/nfs/nfs4state.c | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 375f0fae2c6a..8d0c3a977c36 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -281,6 +281,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, } case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: + case -EKEYEXPIRED: ret = nfs4_delay(server->client, &exception->timeout); if (ret != 0) break; @@ -1163,7 +1164,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state int err; do { err = _nfs4_do_open_reclaim(ctx, state); - if (err != -NFS4ERR_DELAY) + if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED) break; nfs4_handle_exception(server, err, &exception); } while (exception.retry); @@ -1582,6 +1583,7 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state goto out; case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: + case -EKEYEXPIRED: nfs4_handle_exception(server, err, &exception); err = 0; } @@ -3452,6 +3454,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, if (server) nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: + case -EKEYEXPIRED: rpc_delay(task, NFS4_POLL_RETRY_MAX); task->tk_status = 0; return -EAGAIN; @@ -3564,6 +3567,7 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) case -NFS4ERR_RESOURCE: /* The IBM lawyers misread another document! */ case -NFS4ERR_DELAY: + case -EKEYEXPIRED: err = nfs4_delay(clp->cl_rpcclient, &timeout); } } while (err == 0); @@ -4179,7 +4183,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); - if (err != -NFS4ERR_DELAY) + if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED) break; nfs4_handle_exception(server, err, &exception); } while (exception.retry); @@ -4204,6 +4208,7 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request goto out; case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: + case -EKEYEXPIRED: nfs4_handle_exception(server, err, &exception); err = 0; } @@ -4355,6 +4360,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) err = 0; goto out; case -NFS4ERR_DELAY: + case -EKEYEXPIRED: break; } err = nfs4_handle_exception(server, err, &exception); @@ -4554,6 +4560,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case -NFS4ERR_DELAY: case -NFS4ERR_GRACE: + case -EKEYEXPIRED: dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); rpc_delay(task, NFS4_POLL_RETRY_MIN); task->tk_status = 0; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index c1e2733f4fa4..8406cacd3240 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1314,6 +1314,7 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status) case -NFS4ERR_DELAY: case -NFS4ERR_CLID_INUSE: case -EAGAIN: + case -EKEYEXPIRED: break; case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery -- cgit v1.2.3 From b68d69b8c6d19f4c2174f26fe8b750a0e82eb732 Mon Sep 17 00:00:00 2001 From: Jeff Layton <jlayton@redhat.com> Date: Thu, 7 Jan 2010 09:42:04 -0500 Subject: nfs: handle NFSv3 -EKEYEXPIRED errors as we would -EJUKEBOX We're using -EKEYEXPIRED to indicate that a krb5 credcache contains an expired ticket and that we should have the NFS layer retry the RPC call instead of returning an error back to the caller. Handle this as we would an -EJUKEBOX error return. Signed-off-by: Jeff Layton <jlayton@redhat.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/nfs3proc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 3f8881d1a050..24992f0a29f2 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -22,14 +22,14 @@ #define NFSDBG_FACILITY NFSDBG_PROC -/* A wrapper to handle the EJUKEBOX error message */ +/* A wrapper to handle the EJUKEBOX and EKEYEXPIRED error messages */ static int nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) { int res; do { res = rpc_call_sync(clnt, msg, flags); - if (res != -EJUKEBOX) + if (res != -EJUKEBOX && res != -EKEYEXPIRED) break; schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; @@ -42,9 +42,10 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) static int nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) { - if (task->tk_status != -EJUKEBOX) + if (task->tk_status != -EJUKEBOX && task->tk_status != -EKEYEXPIRED) return 0; - nfs_inc_stats(inode, NFSIOS_DELAY); + if (task->tk_status == -EJUKEBOX) + nfs_inc_stats(inode, NFSIOS_DELAY); task->tk_status = 0; rpc_restart_call(task); rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); -- cgit v1.2.3 From 97cefcc6d0aa6b4fc9ba67eb1ef4cc9e25f826f2 Mon Sep 17 00:00:00 2001 From: Jeff Layton <jlayton@redhat.com> Date: Fri, 8 Jan 2010 12:17:21 -0500 Subject: nfs: handle NFSv2 -EKEYEXPIRED returns from RPC layer appropriately Add a wrapper around rpc_call_sync that handles -EKEYEXPIRED errors from the RPC layer as it would an -EJUKEBOX error if NFSv2 had such a thing. Also, add a handler for that error for async calls that makes it resubmit the RPC on -EKEYEXPIRED. Signed-off-by: Jeff Layton <jlayton@redhat.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/proc.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'fs') diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index ef583854d8d0..c752d944fe9e 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -46,6 +46,39 @@ #define NFSDBG_FACILITY NFSDBG_PROC +/* + * wrapper to handle the -EKEYEXPIRED error message. This should generally + * only happen if using krb5 auth and a user's TGT expires. NFSv2 doesn't + * support the NFSERR_JUKEBOX error code, but we handle this situation in the + * same way that we handle that error with NFSv3. + */ +static int +nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) +{ + int res; + do { + res = rpc_call_sync(clnt, msg, flags); + if (res != -EKEYEXPIRED) + break; + schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); + res = -ERESTARTSYS; + } while (!fatal_signal_pending(current)); + return res; +} + +#define rpc_call_sync(clnt, msg, flags) nfs_rpc_wrapper(clnt, msg, flags) + +static int +nfs_async_handle_expired_key(struct rpc_task *task) +{ + if (task->tk_status != -EKEYEXPIRED) + return 0; + task->tk_status = 0; + rpc_restart_call(task); + rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); + return 1; +} + /* * Bare-bones access to getattr: this is for nfs_read_super. */ @@ -307,6 +340,8 @@ nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) { + if (nfs_async_handle_expired_key(task)) + return 0; nfs_mark_for_revalidate(dir); return 1; } @@ -560,6 +595,9 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) { + if (nfs_async_handle_expired_key(task)) + return -EAGAIN; + nfs_invalidate_atime(data->inode); if (task->tk_status >= 0) { nfs_refresh_inode(data->inode, data->res.fattr); @@ -579,6 +617,9 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message * static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) { + if (nfs_async_handle_expired_key(task)) + return -EAGAIN; + if (task->tk_status >= 0) nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr); return 0; -- cgit v1.2.3 From 8e0d46e13833b06832395e7eacccae8af8743461 Mon Sep 17 00:00:00 2001 From: Mike Sager <sager@netapp.com> Date: Thu, 17 Dec 2009 12:06:26 -0500 Subject: nfs41: Adjust max cache response size value For the CREATE_SESSION attribute ca_maxresponsesize_cached, calculate the value based on the rpc reply header size plus the maximum nfs compound reply size. Signed-off-by: Mike Sager <sager@netapp.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/nfs4proc.c | 6 ++---- fs/nfs/nfs4xdr.c | 10 +++++++++- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8d0c3a977c36..b829118c7e04 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4805,16 +4805,14 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) args->fc_attrs.headerpadsz = 0; args->fc_attrs.max_rqst_sz = mxrqst_sz; args->fc_attrs.max_resp_sz = mxresp_sz; - args->fc_attrs.max_resp_sz_cached = mxresp_sz; args->fc_attrs.max_ops = NFS4_MAX_OPS; args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs; dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u " - "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", + "max_ops=%u max_reqs=%u\n", __func__, args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz, - args->fc_attrs.max_resp_sz_cached, args->fc_attrs.max_ops, - args->fc_attrs.max_reqs); + args->fc_attrs.max_ops, args->fc_attrs.max_reqs); /* Back channel attributes */ args->bc_attrs.headerpadsz = 0; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e437fd6a819f..020ebf151184 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1578,6 +1578,14 @@ static void encode_create_session(struct xdr_stream *xdr, char machine_name[NFS4_MAX_MACHINE_NAME_LEN]; uint32_t len; struct nfs_client *clp = args->client; + u32 max_resp_sz_cached; + + /* + * Assumes OPEN is the biggest non-idempotent compound. + * 2 is the verifier. + */ + max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + + RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT; len = scnprintf(machine_name, sizeof(machine_name), "%s", clp->cl_ipaddr); @@ -1592,7 +1600,7 @@ static void encode_create_session(struct xdr_stream *xdr, *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ - *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(max_resp_sz_cached); /* Max resp sz cached */ *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */ *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */ *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ -- cgit v1.2.3 From a7989c3e4702203baa5ddb3614f92bfc49a6e491 Mon Sep 17 00:00:00 2001 From: Mike Sager <sager@netapp.com> Date: Tue, 19 Jan 2010 12:54:40 -0500 Subject: nfs41: Check slot table for referring calls Traverse a list of referring calls and look for a session/slot/seq number match. Signed-off-by: Mike Sager <sager@netapp.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/callback_proc.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index defa9b4c470e..631b44c1439b 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -225,6 +225,61 @@ validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid) return NULL; } +/* + * For each referring call triple, check the session's slot table for + * a match. If the slot is in use and the sequence numbers match, the + * client is still waiting for a response to the original request. + */ +static bool referring_call_exists(struct nfs_client *clp, + uint32_t nrclists, + struct referring_call_list *rclists) +{ + bool status = 0; + int i, j; + struct nfs4_session *session; + struct nfs4_slot_table *tbl; + struct referring_call_list *rclist; + struct referring_call *ref; + + /* + * XXX When client trunking is implemented, this becomes + * a session lookup from within the loop + */ + session = clp->cl_session; + tbl = &session->fc_slot_table; + + for (i = 0; i < nrclists; i++) { + rclist = &rclists[i]; + if (memcmp(session->sess_id.data, + rclist->rcl_sessionid.data, + NFS4_MAX_SESSIONID_LEN) != 0) + continue; + + for (j = 0; j < rclist->rcl_nrefcalls; j++) { + ref = &rclist->rcl_refcalls[j]; + + dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u " + "slotid %u\n", __func__, + ((u32 *)&rclist->rcl_sessionid.data)[0], + ((u32 *)&rclist->rcl_sessionid.data)[1], + ((u32 *)&rclist->rcl_sessionid.data)[2], + ((u32 *)&rclist->rcl_sessionid.data)[3], + ref->rc_sequenceid, ref->rc_slotid); + + spin_lock(&tbl->slot_tbl_lock); + status = (test_bit(ref->rc_slotid, tbl->used_slots) && + tbl->slots[ref->rc_slotid].seq_nr == + ref->rc_sequenceid); + spin_unlock(&tbl->slot_tbl_lock); + if (status) + goto out; + } + } + +out: + return status; +} + /* FIXME: referring calls should be processed */ unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, struct cb_sequenceres *res) -- cgit v1.2.3 From 72ce2b3c064471fc511a9ca2fb6c38d90d2ab826 Mon Sep 17 00:00:00 2001 From: Mike Sager <sager@netapp.com> Date: Tue, 19 Jan 2010 12:54:41 -0500 Subject: nfs41: Process callback's referring call list If a CB_SEQUENCE referring call triple matches a slot table entry, the client is still waiting for a response to the original request. In this case, return NFS4ERR_DELAY as the response to the callback. Signed-off-by: Mike Sager <sager@netapp.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/callback_proc.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 631b44c1439b..49c4b548b4d0 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -280,17 +280,12 @@ out: return status; } -/* FIXME: referring calls should be processed */ unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, struct cb_sequenceres *res) { struct nfs_client *clp; int i, status; - for (i = 0; i < args->csa_nrclists; i++) - kfree(args->csa_rclists[i].rcl_refcalls); - kfree(args->csa_rclists); - status = htonl(NFS4ERR_BADSESSION); clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid); if (clp == NULL) @@ -301,6 +296,16 @@ unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, if (status) goto out_putclient; + /* + * Check for pending referring calls. If a match is found, a + * related callback was received before the response to the original + * call. + */ + if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { + status = htonl(NFS4ERR_DELAY); + goto out_putclient; + } + memcpy(&res->csr_sessionid, &args->csa_sessionid, sizeof(res->csr_sessionid)); res->csr_sequenceid = args->csa_sequenceid; @@ -311,6 +316,10 @@ unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, out_putclient: nfs_put_client(clp); out: + for (i = 0; i < args->csa_nrclists; i++) + kfree(args->csa_rclists[i].rcl_refcalls); + kfree(args->csa_rclists); + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); res->csr_status = status; return res->csr_status; -- cgit v1.2.3 From 31d2b4356b054537c35f4f8a7533e0b4a494dcc6 Mon Sep 17 00:00:00 2001 From: Andy Adamson <andros@netapp.com> Date: Thu, 14 Jan 2010 17:45:04 -0500 Subject: nfs41: fix wrong error on callback header xdr overflow Set NFS4ERR_RESOURCE as CB_COMPOUND status and do not return an op on decode_op_hdr or encode_op_hdr buffer overflow. NFS4ERR_RESOURCE is correct for v4.0. Will fix the return for v4.1 along with all the other NFS4ERR_RESOURCE errors in a later patch. Signed-off-by: Andy Adamson <andros@netapp.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/callback_xdr.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 8e1a2511c8be..6ae327871b86 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -28,6 +28,9 @@ #define NFSDBG_FACILITY NFSDBG_CALLBACK +/* Internal error code */ +#define NFS4ERR_RESOURCE_HDR 11050 + typedef __be32 (*callback_process_op_t)(void *, void *); typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); @@ -173,7 +176,7 @@ static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op) __be32 *p; p = read_buf(xdr, 4); if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); + return htonl(NFS4ERR_RESOURCE_HDR); *op = ntohl(*p); return 0; } @@ -465,7 +468,7 @@ static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res) p = xdr_reserve_space(xdr, 8); if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); + return htonl(NFS4ERR_RESOURCE_HDR); *p++ = htonl(op); *p = res; return 0; @@ -605,17 +608,15 @@ static __be32 process_op(uint32_t minorversion, int nop, struct xdr_stream *xdr_out, void *resp) { struct callback_op *op = &callback_ops[0]; - unsigned int op_nr = OP_CB_ILLEGAL; + unsigned int op_nr; __be32 status; long maxlen; __be32 res; dprintk("%s: start\n", __func__); status = decode_op_hdr(xdr_in, &op_nr); - if (unlikely(status)) { - status = htonl(NFS4ERR_OP_ILLEGAL); - goto out; - } + if (unlikely(status)) + return status; dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", __func__, minorversion, nop, op_nr); @@ -624,7 +625,7 @@ static __be32 process_op(uint32_t minorversion, int nop, preprocess_nfs4_op(op_nr, &op); if (status == htonl(NFS4ERR_OP_ILLEGAL)) op_nr = OP_CB_ILLEGAL; -out: + maxlen = xdr_out->end - xdr_out->p; if (maxlen > 0 && maxlen < PAGE_SIZE) { if (likely(status == 0 && op->decode_args != NULL)) @@ -635,8 +636,8 @@ out: status = htonl(NFS4ERR_RESOURCE); res = encode_op_hdr(xdr_out, op_nr, status); - if (status == 0) - status = res; + if (unlikely(res)) + return res; if (op->encode_res != NULL && status == 0) status = op->encode_res(rqstp, xdr_out, resp); dprintk("%s: done, status = %d\n", __func__, ntohl(status)); @@ -677,6 +678,13 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r nops++; } + /* Buffer overflow in decode_ops_hdr or encode_ops_hdr. Return + * resource error in cb_compound status without returning op */ + if (unlikely(status == htonl(NFS4ERR_RESOURCE_HDR))) { + status = htonl(NFS4ERR_RESOURCE); + nops--; + } + *hdr_res.status = status; *hdr_res.nops = htonl(nops); dprintk("%s: done, status = %u\n", __func__, ntohl(status)); -- cgit v1.2.3 From b92b30190093377828efcde5fc4cf7598fa1ee46 Mon Sep 17 00:00:00 2001 From: Andy Adamson <andros@netapp.com> Date: Thu, 14 Jan 2010 17:45:05 -0500 Subject: nfs41: directly encode back channel error Skip all other processing when error is encountered. Signed-off-by: Andy Adamson <andros@netapp.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/callback_xdr.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 6ae327871b86..d3e07f469949 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -625,16 +625,19 @@ static __be32 process_op(uint32_t minorversion, int nop, preprocess_nfs4_op(op_nr, &op); if (status == htonl(NFS4ERR_OP_ILLEGAL)) op_nr = OP_CB_ILLEGAL; + if (status) + goto encode_hdr; maxlen = xdr_out->end - xdr_out->p; if (maxlen > 0 && maxlen < PAGE_SIZE) { - if (likely(status == 0 && op->decode_args != NULL)) + if (likely(op->decode_args != NULL)) status = op->decode_args(rqstp, xdr_in, argp); if (likely(status == 0 && op->process_op != NULL)) status = op->process_op(argp, resp); } else status = htonl(NFS4ERR_RESOURCE); +encode_hdr: res = encode_op_hdr(xdr_out, op_nr, status); if (unlikely(res)) return res; -- cgit v1.2.3 From e95e60daee44fade63f32429ddcf1c2012a95632 Mon Sep 17 00:00:00 2001 From: Andy Adamson <andros@netapp.com> Date: Thu, 14 Jan 2010 17:45:06 -0500 Subject: nfs41: remove uneeded checks in callback processing All callback operations have arguments to decode and require processing. The preprocess_nfs4X_op functions catch unsupported or illegal ops so decode_args and process_op pointers are always non NULL. Signed-off-by: Andy Adamson <andros@netapp.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/callback_xdr.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index d3e07f469949..a6f2ded72b17 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -630,9 +630,8 @@ static __be32 process_op(uint32_t minorversion, int nop, maxlen = xdr_out->end - xdr_out->p; if (maxlen > 0 && maxlen < PAGE_SIZE) { - if (likely(op->decode_args != NULL)) - status = op->decode_args(rqstp, xdr_in, argp); - if (likely(status == 0 && op->process_op != NULL)) + status = op->decode_args(rqstp, xdr_in, argp); + if (likely(status == 0)) status = op->process_op(argp, resp); } else status = htonl(NFS4ERR_RESOURCE); -- cgit v1.2.3 From b2f28bd78354b9bbcd178bf6bbf6b2277cd9b761 Mon Sep 17 00:00:00 2001 From: Andy Adamson <andros@netapp.com> Date: Thu, 14 Jan 2010 17:45:07 -0500 Subject: nfs41: prepare for back channel drc Make all cb_sequence arguments available to verify_seqid which will make replay decisions. Signed-off-by: Andy Adamson <andros@netapp.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/callback_proc.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 49c4b548b4d0..3d7edd65577b 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -153,34 +153,34 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n * a single outstanding callback request at a time. */ static int -validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid) +validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) { struct nfs4_slot *slot; dprintk("%s enter. slotid %d seqid %d\n", - __func__, slotid, seqid); + __func__, args->csa_slotid, args->csa_sequenceid); - if (slotid > NFS41_BC_MAX_CALLBACKS) + if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS) return htonl(NFS4ERR_BADSLOT); - slot = tbl->slots + slotid; + slot = tbl->slots + args->csa_slotid; dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr); /* Normal */ - if (likely(seqid == slot->seq_nr + 1)) { + if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { slot->seq_nr++; return htonl(NFS4_OK); } /* Replay */ - if (seqid == slot->seq_nr) { + if (args->csa_sequenceid == slot->seq_nr) { dprintk("%s seqid %d is a replay - no DRC available\n", - __func__, seqid); + __func__, args->csa_sequenceid); return htonl(NFS4_OK); } /* Wraparound */ - if (seqid == 1 && (slot->seq_nr + 1) == 0) { + if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) { slot->seq_nr = 1; return htonl(NFS4_OK); } @@ -291,8 +291,7 @@ unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, if (clp == NULL) goto out; - status = validate_seqid(&clp->cl_session->bc_slot_table, - args->csa_slotid, args->csa_sequenceid); + status = validate_seqid(&clp->cl_session->bc_slot_table, args); if (status) goto out_putclient; -- cgit v1.2.3 From 4911096f1a5df73c12c287a42ece4e7b5d9c19ec Mon Sep 17 00:00:00 2001 From: Andy Adamson <andros@netapp.com> Date: Thu, 14 Jan 2010 17:45:08 -0500 Subject: nfs41: back channel drc minimal implementation For now the back channel ca_maxresponsesize_cached is 0 and there is no backchannel DRC. Return NFS4ERR_REP_TOO_BIG_TO_CACHE when a cb_sequence cachethis is true. When it is false, return NFS4ERR_RETRY_UNCACHED_REP as the next operation error. Remember the replay error accross compound operation processing. Signed-off-by: Andy Adamson <andros@netapp.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/callback_proc.c | 25 +++++++++++++++++-------- fs/nfs/callback_xdr.c | 19 +++++++++++++++---- 2 files changed, 32 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 3d7edd65577b..4062f7690a33 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -143,9 +143,8 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n * Return success if the sequenceID is one more than what we last saw on * this slot, accounting for wraparound. Increments the slot's sequence. * - * We don't yet implement a duplicate request cache, so at this time - * we will log replays, and process them as if we had not seen them before, - * but we don't bump the sequence in the slot. Not too worried about it, + * We don't yet implement a duplicate request cache, instead we set the + * back channel ca_maxresponsesize_cached to zero. This is OK for now * since we only currently implement idempotent callbacks anyway. * * We have a single slot backchannel at this time, so we don't bother @@ -174,9 +173,15 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) /* Replay */ if (args->csa_sequenceid == slot->seq_nr) { - dprintk("%s seqid %d is a replay - no DRC available\n", + dprintk("%s seqid %d is a replay\n", __func__, args->csa_sequenceid); - return htonl(NFS4_OK); + /* Signal process_op to set this error on next op */ + if (args->csa_cachethis == 0) + return htonl(NFS4ERR_RETRY_UNCACHED_REP); + + /* The ca_maxresponsesize_cached is 0 with no DRC */ + else if (args->csa_cachethis == 1) + return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE); } /* Wraparound */ @@ -319,9 +324,13 @@ out: kfree(args->csa_rclists[i].rcl_refcalls); kfree(args->csa_rclists); - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - res->csr_status = status; - return res->csr_status; + if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) + res->csr_status = 0; + else + res->csr_status = status; + dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, + ntohl(status), ntohl(res->csr_status)); + return status; } unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index a6f2ded72b17..08b430d922c4 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -605,7 +605,7 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) static __be32 process_op(uint32_t minorversion, int nop, struct svc_rqst *rqstp, struct xdr_stream *xdr_in, void *argp, - struct xdr_stream *xdr_out, void *resp) + struct xdr_stream *xdr_out, void *resp, int* drc_status) { struct callback_op *op = &callback_ops[0]; unsigned int op_nr; @@ -628,6 +628,11 @@ static __be32 process_op(uint32_t minorversion, int nop, if (status) goto encode_hdr; + if (*drc_status) { + status = *drc_status; + goto encode_hdr; + } + maxlen = xdr_out->end - xdr_out->p; if (maxlen > 0 && maxlen < PAGE_SIZE) { status = op->decode_args(rqstp, xdr_in, argp); @@ -636,6 +641,12 @@ static __be32 process_op(uint32_t minorversion, int nop, } else status = htonl(NFS4ERR_RESOURCE); + /* Only set by OP_CB_SEQUENCE processing */ + if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) { + *drc_status = status; + status = 0; + } + encode_hdr: res = encode_op_hdr(xdr_out, op_nr, status); if (unlikely(res)) @@ -655,7 +666,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r struct cb_compound_hdr_res hdr_res = { NULL }; struct xdr_stream xdr_in, xdr_out; __be32 *p; - __be32 status; + __be32 status, drc_status = 0; unsigned int nops = 0; dprintk("%s: start\n", __func__); @@ -675,8 +686,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r return rpc_system_err; while (status == 0 && nops != hdr_arg.nops) { - status = process_op(hdr_arg.minorversion, nops, - rqstp, &xdr_in, argp, &xdr_out, resp); + status = process_op(hdr_arg.minorversion, nops, rqstp, + &xdr_in, argp, &xdr_out, resp, &drc_status); nops++; } -- cgit v1.2.3 From b9efa1b27e25b1286504973c0a6bf0f24106faa8 Mon Sep 17 00:00:00 2001 From: Andy Adamson <andros@netapp.com> Date: Wed, 20 Jan 2010 16:06:27 -0500 Subject: nfs41: implement cb_recall_slot Drain the fore channel and reset the max_slots to the new value. Signed-off-by: Andy Adamson <andros@netapp.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/callback.h | 8 ++++++++ fs/nfs/callback_proc.c | 32 ++++++++++++++++++++++++++++++++ fs/nfs/callback_xdr.c | 22 +++++++++++++++++++++- fs/nfs/nfs4_fs.h | 2 ++ fs/nfs/nfs4state.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/nfs_fs_sb.h | 2 ++ 6 files changed, 109 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index d4036be0b589..85a7cfd1b8dd 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -119,6 +119,14 @@ struct cb_recallanyargs { }; extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy); + +struct cb_recallslotargs { + struct sockaddr *crsa_addr; + uint32_t crsa_target_max_slots; +}; +extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, + void *dummy); + #endif /* CONFIG_NFS_V4_1 */ extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 4062f7690a33..e5155d9df595 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -361,4 +361,36 @@ out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; } + +/* Reduce the fore channel's max_slots to the target value */ +unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy) +{ + struct nfs_client *clp; + struct nfs4_slot_table *fc_tbl; + int status; + + status = htonl(NFS4ERR_OP_NOT_IN_SESSION); + clp = nfs_find_client(args->crsa_addr, 4); + if (clp == NULL) + goto out; + + dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), + args->crsa_target_max_slots); + + fc_tbl = &clp->cl_session->fc_slot_table; + + status = htonl(NFS4ERR_BAD_HIGH_SLOT); + if (args->crsa_target_max_slots >= fc_tbl->max_slots || + args->crsa_target_max_slots < 1) + goto out; + + fc_tbl->target_max_slots = args->crsa_target_max_slots; + nfs41_handle_recall_slot(clp); + status = htonl(NFS4_OK); + nfs_put_client(clp); /* balance nfs_find_client */ +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 08b430d922c4..8e66e20b59fd 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -24,6 +24,7 @@ #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 4 + 1 + 3) #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #endif /* CONFIG_NFS_V4_1 */ #define NFSDBG_FACILITY NFSDBG_CALLBACK @@ -349,6 +350,20 @@ static unsigned decode_recallany_args(struct svc_rqst *rqstp, return 0; } +static unsigned decode_recallslot_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_recallslotargs *args) +{ + __be32 *p; + + args->crsa_addr = svc_addr(rqstp); + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + args->crsa_target_max_slots = ntohl(*p++); + return 0; +} + #endif /* CONFIG_NFS_V4_1 */ static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) @@ -557,6 +572,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_RECALL: case OP_CB_SEQUENCE: case OP_CB_RECALL_ANY: + case OP_CB_RECALL_SLOT: *op = &callback_ops[op_nr]; break; @@ -565,7 +581,6 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_NOTIFY: case OP_CB_PUSH_DELEG: case OP_CB_RECALLABLE_OBJ_AVAIL: - case OP_CB_RECALL_SLOT: case OP_CB_WANTS_CANCELLED: case OP_CB_NOTIFY_LOCK: return htonl(NFS4ERR_NOTSUPP); @@ -734,6 +749,11 @@ static struct callback_op callback_ops[] = { .decode_args = (callback_decode_arg_t)decode_recallany_args, .res_maxsize = CB_OP_RECALLANY_RES_MAXSZ, }, + [OP_CB_RECALL_SLOT] = { + .process_op = (callback_process_op_t)nfs4_callback_recallslot, + .decode_args = (callback_decode_arg_t)decode_recallslot_args, + .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ, + }, #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 0c6fda33d66e..a187200a7aac 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -46,6 +46,7 @@ enum nfs4_client_state { NFS4CLNT_DELEGRETURN, NFS4CLNT_SESSION_RESET, NFS4CLNT_SESSION_DRAINING, + NFS4CLNT_RECALL_SLOT, }; /* @@ -280,6 +281,7 @@ extern void nfs4_schedule_state_manager(struct nfs_client *); extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); +extern void nfs41_handle_recall_slot(struct nfs_client *clp); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 8406cacd3240..9164758c1ace 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1249,6 +1249,12 @@ static int nfs4_reclaim_lease(struct nfs_client *clp) } #ifdef CONFIG_NFS_V4_1 +void nfs41_handle_recall_slot(struct nfs_client *clp) +{ + set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); + nfs4_schedule_state_recovery(clp); +} + void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) { if (!flags) @@ -1299,9 +1305,38 @@ out: return status; } +static int nfs4_recall_slot(struct nfs_client *clp) +{ + struct nfs4_slot_table *fc_tbl = &clp->cl_session->fc_slot_table; + struct nfs4_channel_attrs *fc_attrs = &clp->cl_session->fc_attrs; + struct nfs4_slot *new, *old; + int i; + + nfs4_begin_drain_session(clp); + new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot), + GFP_KERNEL); + if (!new) + return -ENOMEM; + + spin_lock(&fc_tbl->slot_tbl_lock); + for (i = 0; i < fc_tbl->target_max_slots; i++) + new[i].seq_nr = fc_tbl->slots[i].seq_nr; + old = fc_tbl->slots; + fc_tbl->slots = new; + fc_tbl->max_slots = fc_tbl->target_max_slots; + fc_tbl->target_max_slots = 0; + fc_attrs->max_reqs = fc_tbl->max_slots; + spin_unlock(&fc_tbl->slot_tbl_lock); + + kfree(old); + nfs4_end_drain_session(clp); + return 0; +} + #else /* CONFIG_NFS_V4_1 */ static int nfs4_reset_session(struct nfs_client *clp) { return 0; } static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; } +static int nfs4_recall_slot(struct nfs_client *clp) { return 0; } #endif /* CONFIG_NFS_V4_1 */ /* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors @@ -1398,6 +1433,15 @@ static void nfs4_state_manager(struct nfs_client *clp) nfs_client_return_marked_delegations(clp); continue; } + /* Recall session slots */ + if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state) + && nfs4_has_session(clp)) { + status = nfs4_recall_slot(clp); + if (status < 0) + goto out_error; + continue; + } + nfs4_clear_state_manager_bit(clp); /* Did we race with an attempt to give us more work? */ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 34fc6be5bfcf..ecd9e6c74d06 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -193,6 +193,8 @@ struct nfs4_slot_table { int max_slots; /* # slots in table */ int highest_used_slotid; /* sent to server on each SEQ. * op for dynamic resizing */ + int target_max_slots; /* Set by CB_RECALL_SLOT as + * the new max_slots */ }; static inline int slot_idx(struct nfs4_slot_table *tbl, struct nfs4_slot *sp) -- cgit v1.2.3 From 104aeba484c9291cde2def6d037b836af46d8eb0 Mon Sep 17 00:00:00 2001 From: Andy Adamson <andros@netapp.com> Date: Thu, 14 Jan 2010 17:45:10 -0500 Subject: nfs41: resize slot table in reset When session is reset, client can renegotiate slot table size. Signed-off-by: Andy Adamson <andros@netapp.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/nfs4proc.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b829118c7e04..84b53d38f50b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4618,26 +4618,32 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) /* * Reset a slot table */ -static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, int max_slots, - int old_max_slots, int ivalue) +static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs, + int ivalue) { + struct nfs4_slot *new = NULL; int i; int ret = 0; - dprintk("--> %s: max_reqs=%u, tbl %p\n", __func__, max_slots, tbl); + dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__, + max_reqs, tbl->max_slots); - /* - * Until we have dynamic slot table adjustment, insist - * upon the same slot table size - */ - if (max_slots != old_max_slots) { - dprintk("%s reset slot table does't match old\n", - __func__); - ret = -EINVAL; /*XXX NFS4ERR_REQ_TOO_BIG ? */ - goto out; + /* Does the newly negotiated max_reqs match the existing slot table? */ + if (max_reqs != tbl->max_slots) { + ret = -ENOMEM; + new = kmalloc(max_reqs * sizeof(struct nfs4_slot), + GFP_KERNEL); + if (!new) + goto out; + ret = 0; + kfree(tbl->slots); } spin_lock(&tbl->slot_tbl_lock); - for (i = 0; i < max_slots; ++i) + if (new) { + tbl->slots = new; + tbl->max_slots = max_reqs; + } + for (i = 0; i < tbl->max_slots; ++i) tbl->slots[i].seq_nr = ivalue; spin_unlock(&tbl->slot_tbl_lock); dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, @@ -4655,16 +4661,12 @@ static int nfs4_reset_slot_tables(struct nfs4_session *session) int status; status = nfs4_reset_slot_table(&session->fc_slot_table, - session->fc_attrs.max_reqs, - session->fc_slot_table.max_slots, - 1); + session->fc_attrs.max_reqs, 1); if (status) return status; status = nfs4_reset_slot_table(&session->bc_slot_table, - session->bc_attrs.max_reqs, - session->bc_slot_table.max_slots, - 0); + session->bc_attrs.max_reqs, 0); return status; } -- cgit v1.2.3 From bae0ac0ee1839e345a9b26d8c00eb3ef565caad1 Mon Sep 17 00:00:00 2001 From: Andy Adamson <andros@netapp.com> Date: Thu, 21 Jan 2010 14:19:16 -0500 Subject: nfs41: fix nfs4_callback_recallslot Return NFS4_OK if target high slotid equals enforced high slotid. Fix nfs_client reference leak. Signed-off-by: Andy Adamson <andros@netapp.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/callback_proc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index e5155d9df595..c79e18cd0e15 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -381,13 +381,17 @@ unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy) fc_tbl = &clp->cl_session->fc_slot_table; status = htonl(NFS4ERR_BAD_HIGH_SLOT); - if (args->crsa_target_max_slots >= fc_tbl->max_slots || + if (args->crsa_target_max_slots > fc_tbl->max_slots || args->crsa_target_max_slots < 1) - goto out; + goto out_putclient; + + status = htonl(NFS4_OK); + if (args->crsa_target_max_slots == fc_tbl->max_slots) + goto out_putclient; fc_tbl->target_max_slots = args->crsa_target_max_slots; nfs41_handle_recall_slot(clp); - status = htonl(NFS4_OK); +out_putclient: nfs_put_client(clp); /* balance nfs_find_client */ out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); -- cgit v1.2.3 From 41f54a55480c752d9419cac5e647785cb794142e Mon Sep 17 00:00:00 2001 From: Andy Adamson <andros@netapp.com> Date: Thu, 21 Jan 2010 14:54:13 -0500 Subject: nfs41: clear NFS4CLNT_RECALL_SLOT bit on session reset Signed-off-by: Andy Adamson <andros@netapp.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/nfs4state.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9164758c1ace..2931c46c4127 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1291,17 +1291,17 @@ static int nfs4_reset_session(struct nfs_client *clp) memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN); status = nfs4_proc_create_session(clp); - if (status) + if (status) { status = nfs4_recovery_handle_error(clp, status); + goto out; + } + /* create_session negotiated new slot table */ + clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); -out: - /* - * Let the state manager reestablish state - */ - if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && - status == 0) + /* Let the state manager reestablish state */ + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) nfs41_setup_state_renewal(clp); - +out: return status; } -- cgit v1.2.3 From 9733f0d9289cbcac4fa03db0cb5aec1ab01c6bc9 Mon Sep 17 00:00:00 2001 From: Andy Adamson <andros@netapp.com> Date: Fri, 22 Jan 2010 12:03:08 -0500 Subject: nfs41: cleanup callback code to use __be32 type Signed-off-by: Andy Adamson <andros@netapp.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/callback_proc.c | 15 ++++++++------- fs/nfs/callback_xdr.c | 30 +++++++++++++++--------------- 2 files changed, 23 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index c79e18cd0e15..84761b5bb8e2 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -151,7 +151,7 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n * checking the used_slots bit array on the table. The lower layer guarantees * a single outstanding callback request at a time. */ -static int +static __be32 validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) { struct nfs4_slot *slot; @@ -285,11 +285,12 @@ out: return status; } -unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, +__be32 nfs4_callback_sequence(struct cb_sequenceargs *args, struct cb_sequenceres *res) { struct nfs_client *clp; - int i, status; + int i; + __be32 status; status = htonl(NFS4ERR_BADSESSION); clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid); @@ -333,10 +334,10 @@ out: return status; } -unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) +__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) { struct nfs_client *clp; - int status; + __be32 status; fmode_t flags = 0; status = htonl(NFS4ERR_OP_NOT_IN_SESSION); @@ -363,11 +364,11 @@ out: } /* Reduce the fore channel's max_slots to the target value */ -unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy) +__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy) { struct nfs_client *clp; struct nfs4_slot_table *fc_tbl; - int status; + __be32 status; status = htonl(NFS4ERR_OP_NOT_IN_SESSION); clp = nfs_find_client(args->crsa_addr, 4); diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 8e66e20b59fd..db30c0b398b5 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -219,10 +219,10 @@ out: #if defined(CONFIG_NFS_V4_1) -static unsigned decode_sessionid(struct xdr_stream *xdr, +static __be32 decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) { - uint32_t *p; + __be32 *p; int len = NFS4_MAX_SESSIONID_LEN; p = read_buf(xdr, len); @@ -233,12 +233,12 @@ static unsigned decode_sessionid(struct xdr_stream *xdr, return 0; } -static unsigned decode_rc_list(struct xdr_stream *xdr, +static __be32 decode_rc_list(struct xdr_stream *xdr, struct referring_call_list *rc_list) { - uint32_t *p; + __be32 *p; int i; - unsigned status; + __be32 status; status = decode_sessionid(xdr, &rc_list->rcl_sessionid); if (status) @@ -271,13 +271,13 @@ out: return status; } -static unsigned decode_cb_sequence_args(struct svc_rqst *rqstp, +static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_sequenceargs *args) { - uint32_t *p; + __be32 *p; int i; - unsigned status; + __be32 status; status = decode_sessionid(xdr, &args->csa_sessionid); if (status) @@ -331,11 +331,11 @@ out_free: goto out; } -static unsigned decode_recallany_args(struct svc_rqst *rqstp, +static __be32 decode_recallany_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallanyargs *args) { - uint32_t *p; + __be32 *p; args->craa_addr = svc_addr(rqstp); p = read_buf(xdr, 4); @@ -350,7 +350,7 @@ static unsigned decode_recallany_args(struct svc_rqst *rqstp, return 0; } -static unsigned decode_recallslot_args(struct svc_rqst *rqstp, +static __be32 decode_recallslot_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallslotargs *args) { @@ -517,10 +517,10 @@ out: #if defined(CONFIG_NFS_V4_1) -static unsigned encode_sessionid(struct xdr_stream *xdr, +static __be32 encode_sessionid(struct xdr_stream *xdr, const struct nfs4_sessionid *sid) { - uint32_t *p; + __be32 *p; int len = NFS4_MAX_SESSIONID_LEN; p = xdr_reserve_space(xdr, len); @@ -531,11 +531,11 @@ static unsigned encode_sessionid(struct xdr_stream *xdr, return 0; } -static unsigned encode_cb_sequence_res(struct svc_rqst *rqstp, +static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_sequenceres *res) { - uint32_t *p; + __be32 *p; unsigned status = res->csr_status; if (unlikely(status != 0)) -- cgit v1.2.3 From c2459dc46269728e4a080ec8d5a316b2bba2e142 Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Mon, 1 Feb 2010 14:17:14 -0500 Subject: NFS: Proper accounting for NFS VFS calls Nit: The VFSOPEN and VFSFLUSH counters are function call counters. Count every call to these routines. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 63f2071d6445..57cf94f129ba 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -123,11 +123,11 @@ nfs_file_open(struct inode *inode, struct file *filp) filp->f_path.dentry->d_parent->d_name.name, filp->f_path.dentry->d_name.name); + nfs_inc_stats(inode, NFSIOS_VFSOPEN); res = nfs_check_flags(filp->f_flags); if (res) return res; - nfs_inc_stats(inode, NFSIOS_VFSOPEN); res = nfs_open(inode, filp); return res; } @@ -237,9 +237,9 @@ nfs_file_flush(struct file *file, fl_owner_t id) dentry->d_parent->d_name.name, dentry->d_name.name); + nfs_inc_stats(inode, NFSIOS_VFSFLUSH); if ((file->f_mode & FMODE_WRITE) == 0) return 0; - nfs_inc_stats(inode, NFSIOS_VFSFLUSH); /* Flush writes to the server and return any errors */ return nfs_do_fsync(ctx, inode); -- cgit v1.2.3 From 4184dcf2dbde481b34d370e1704f2b91a8c9f0d1 Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Mon, 1 Feb 2010 14:17:23 -0500 Subject: NFS: Fix byte accounting for generic NFS reads Currently, the NFS I/O counters count the number of bytes requested by applications, rather than the number of bytes actually read by the system calls. The number of bytes requested for reads is actually not that useful, because the value is usually a buffer size for reads. That is, that requested number is usually a maximum, and frequently doesn't reflect the actual number of bytes read. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/file.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 57cf94f129ba..7f4910c98c7c 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -262,9 +262,11 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, (unsigned long) count, (unsigned long) pos); result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); - nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count); - if (!result) + if (!result) { result = generic_file_aio_read(iocb, iov, nr_segs, pos); + if (result > 0) + nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); + } return result; } -- cgit v1.2.3 From aa2f1ef10e6ad65c9138ec576f82c08f32e6f32c Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Mon, 1 Feb 2010 14:17:32 -0500 Subject: NFS: Account for NFS bytes read via the splice API Bytes read via the splice API should be accounted for in the NFS performance statistics. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 7f4910c98c7c..abbc20281ea4 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -284,8 +284,11 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, (unsigned long) count, (unsigned long long) *ppos); res = nfs_revalidate_mapping(inode, filp->f_mapping); - if (!res) + if (!res) { res = generic_file_splice_read(filp, ppos, pipe, count, flags); + if (res > 0) + nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); + } return res; } -- cgit v1.2.3 From 7e381172cf6e0282a56374e50667515aed55166a Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Mon, 1 Feb 2010 14:17:41 -0500 Subject: NFS: Improve NFS iostat byte count accuracy for writes The bytes counted by the performance counters for NFS writes should reflect write and sync errors. If the write(2) system call reports an error, the bytes should not be counted. And, if the write is short, the actual number of bytes that was written should be counted, not the number of bytes that was requested. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/file.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index abbc20281ea4..ae8d02294e46 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -601,6 +601,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, { struct dentry * dentry = iocb->ki_filp->f_path.dentry; struct inode * inode = dentry->d_inode; + unsigned long written = 0; ssize_t result; size_t count = iov_length(iov, nr_segs); @@ -627,14 +628,18 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, if (!count) goto out; - nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count); result = generic_file_aio_write(iocb, iov, nr_segs, pos); + if (result > 0) + written = result; + /* Return error values for O_DSYNC and IS_SYNC() */ if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); if (err < 0) result = err; } + if (result > 0) + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); out: return result; @@ -649,6 +654,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, { struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; + unsigned long written = 0; ssize_t ret; dprintk("NFS splice_write(%s/%s, %lu@%llu)\n", @@ -659,14 +665,17 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, * The combination of splice and an O_APPEND destination is disallowed. */ - nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count); - ret = generic_file_splice_write(pipe, filp, ppos, count, flags); + if (ret > 0) + written = ret; + if (ret >= 0 && nfs_need_sync_write(filp, inode)) { int err = nfs_do_fsync(nfs_file_open_context(filp), inode); if (err < 0) ret = err; } + if (ret > 0) + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); return ret; } -- cgit v1.2.3 From f895c53f8ace3c3e49ebf9def90e63fc6d46d2bf Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Mon, 1 Feb 2010 14:17:50 -0500 Subject: NFS: Make close(2) asynchronous when closing NFS O_DIRECT files For NFSv2 and v3: O_DIRECT writes are always synchronous, and aren't cached, so nothing should be flushed when closing an NFS O_DIRECT file descriptor. Thus there are no write errors to report on close(2). In addition, there's no cached data to verify on the next open(2), so we don't need clean GETATTR results at close time to compare with. Thus, there's no need for the nfs_revalidate_inode() call when closing an NFS O_DIRECT file. This reduces the number of synchronous on-the-wire requests for a simple open-write-close of an NFS O_DIRECT file by roughly 20%. For NFSv4: Call nfs4_do_close() with wait set to zero when closing an NFS O_DIRECT file. The CLOSE will go on the wire, but the application won't wait for it to complete. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/inode.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f141bde7756a..87cca56846d6 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -620,11 +620,6 @@ void put_nfs_open_context(struct nfs_open_context *ctx) __put_nfs_open_context(ctx, 0); } -static void put_nfs_open_context_sync(struct nfs_open_context *ctx) -{ - __put_nfs_open_context(ctx, 1); -} - /* * Ensure that mmap has a recent RPC credential for use when writing out * shared pages @@ -671,7 +666,7 @@ static void nfs_file_clear_open_context(struct file *filp) spin_lock(&inode->i_lock); list_move_tail(&ctx->list, &NFS_I(inode)->open_files); spin_unlock(&inode->i_lock); - put_nfs_open_context_sync(ctx); + __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1); } } -- cgit v1.2.3 From 66655de6d132b726be64c324bc3f9ea366d20697 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizf@cn.fujitsu.com> Date: Mon, 8 Feb 2010 23:18:22 +0000 Subject: seq_file: Add helpers for iteration over a hlist Some places in kernel need to iterate over a hlist in seq_file, so provide some common helpers. Signed-off-by: Li Zefan <lizf@cn.fujitsu.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- fs/seq_file.c | 59 +++++++++++++++++++++++++++++++++++++++++++++--- include/linux/seq_file.h | 11 +++++++++ 2 files changed, 67 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index eae7d9dbf3ff..f65b16f02da3 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -674,7 +674,6 @@ struct list_head *seq_list_start(struct list_head *head, loff_t pos) return NULL; } - EXPORT_SYMBOL(seq_list_start); struct list_head *seq_list_start_head(struct list_head *head, loff_t pos) @@ -684,7 +683,6 @@ struct list_head *seq_list_start_head(struct list_head *head, loff_t pos) return seq_list_start(head, pos - 1); } - EXPORT_SYMBOL(seq_list_start_head); struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) @@ -695,5 +693,60 @@ struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) ++*ppos; return lh == head ? NULL : lh; } - EXPORT_SYMBOL(seq_list_next); + +/** + * seq_hlist_start - start an iteration of a hlist + * @head: the head of the hlist + * @pos: the start position of the sequence + * + * Called at seq_file->op->start(). + */ +struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos) +{ + struct hlist_node *node; + + hlist_for_each(node, head) + if (pos-- == 0) + return node; + return NULL; +} +EXPORT_SYMBOL(seq_hlist_start); + +/** + * seq_hlist_start_head - start an iteration of a hlist + * @head: the head of the hlist + * @pos: the start position of the sequence + * + * Called at seq_file->op->start(). Call this function if you want to + * print a header at the top of the output. + */ +struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos) +{ + if (!pos) + return SEQ_START_TOKEN; + + return seq_hlist_start(head, pos - 1); +} +EXPORT_SYMBOL(seq_hlist_start_head); + +/** + * seq_hlist_next - move to the next position of the hlist + * @v: the current iterator + * @head: the head of the hlist + * @pos: the current posision + * + * Called at seq_file->op->next(). + */ +struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, + loff_t *ppos) +{ + struct hlist_node *node = v; + + ++*ppos; + if (v == SEQ_START_TOKEN) + return head->first; + else + return node->next; +} +EXPORT_SYMBOL(seq_hlist_next); diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 8366d8f12e53..c95bcdc18f4c 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -135,4 +135,15 @@ extern struct list_head *seq_list_start_head(struct list_head *head, extern struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos); +/* + * Helpers for iteration over hlist_head-s in seq_files + */ + +extern struct hlist_node *seq_hlist_start(struct hlist_head *head, + loff_t pos); +extern struct hlist_node *seq_hlist_start_head(struct hlist_head *head, + loff_t pos); +extern struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, + loff_t *ppos); + #endif -- cgit v1.2.3 From 87185517de81101da5afbc82cefdeed6eeaa38fb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 3 Feb 2010 19:43:31 +0000 Subject: xfs: only clear the suid bit once in xfs_write file_remove_suid already calls into ->setattr to clear the suid and sgid bits if needed, no need to start a second transaction to do it ourselves. Note that xfs_write_clear_setuid issues a sync transaction while the path through ->setattr doesn't, but that is consistant with the other filesystems. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Alex Elder <aelder@sgi.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_lrw.c | 15 +++------------ fs/xfs/xfs_rw.c | 42 ------------------------------------------ fs/xfs/xfs_rw.h | 1 - 3 files changed, 3 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index c80fa00d2ad7..eac6f80d786d 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -630,18 +630,9 @@ start: * by root. This keeps people from modifying setuid and * setgid binaries. */ - - if (((xip->i_d.di_mode & S_ISUID) || - ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == - (S_ISGID | S_IXGRP))) && - !capable(CAP_FSETID)) { - error = xfs_write_clear_setuid(xip); - if (likely(!error)) - error = -file_remove_suid(file); - if (unlikely(error)) { - goto out_unlock_internal; - } - } + error = -file_remove_suid(file); + if (unlikely(error)) + goto out_unlock_internal; /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index abb2c458b148..e336742a58a4 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -46,48 +46,6 @@ #include "xfs_rw.h" #include "xfs_trace.h" -/* - * This is a subroutine for xfs_write() and other writers (xfs_ioctl) - * which clears the setuid and setgid bits when a file is written. - */ -int -xfs_write_clear_setuid( - xfs_inode_t *ip) -{ - xfs_mount_t *mp; - xfs_trans_t *tp; - int error; - - mp = ip->i_mount; - tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID); - if ((error = xfs_trans_reserve(tp, 0, - XFS_WRITEID_LOG_RES(mp), - 0, 0, 0))) { - xfs_trans_cancel(tp, 0); - return error; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - ip->i_d.di_mode &= ~S_ISUID; - - /* - * Note that we don't have to worry about mandatory - * file locking being disabled here because we only - * clear the S_ISGID bit if the Group execute bit is - * on, but if it was on then mandatory locking wouldn't - * have been enabled. - */ - if (ip->i_d.di_mode & S_IXGRP) { - ip->i_d.di_mode &= ~S_ISGID; - } - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return 0; -} - /* * Force a shutdown of the filesystem instantly while keeping * the filesystem consistent. We don't do an unmount here; just shutdown diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index a54c3b7cd376..11c41ec6ed75 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -39,7 +39,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) /* * Prototypes for functions in xfs_rw.c. */ -extern int xfs_write_clear_setuid(struct xfs_inode *ip); extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, xfs_daddr_t blkno, int len, uint flags, struct xfs_buf **bpp); -- cgit v1.2.3 From 180040b89ee2aed88c0a0b1fcf7ada9a512b12e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Fri, 5 Feb 2010 09:57:55 +0000 Subject: xfs: optimize log flushing in xfs_fsync If we have a pinned inode it must have a log item attached to it. Usually that log item will have ili_last_lsn already set, in which case we only need to flush the log up to that LSN instead of doing a full log force. This gives speedups of about 5% in some fsync heavy workloads. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/xfs_vnodeops.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 43241e289800..ddd2c5d1b854 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -629,8 +629,14 @@ xfs_fsync( */ xfs_iunlock(ip, XFS_ILOCK_SHARED); if (xfs_ipincount(ip)) { - error = _xfs_log_force(ip->i_mount, XFS_LOG_SYNC, - &log_flushed); + if (ip->i_itemp->ili_last_lsn) { + error = _xfs_log_force_lsn(ip->i_mount, + ip->i_itemp->ili_last_lsn, + XFS_LOG_SYNC, &log_flushed); + } else { + error = _xfs_log_force(ip->i_mount, + XFS_LOG_SYNC, &log_flushed); + } } } else { /* -- cgit v1.2.3 From e902ec9906e844f4613fa6190c6fa65f162dc86e Mon Sep 17 00:00:00 2001 From: Jiro SEKIBA <jir@unicus.jp> Date: Sat, 30 Jan 2010 18:06:35 +0900 Subject: nilfs2: issue discard request after cleaning segments This adds a function to send discard requests for given array of segment numbers, and calls the function when garbage collection succeeded. Signed-off-by: Jiro SEKIBA <jir@unicus.jp> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> --- Documentation/filesystems/nilfs2.txt | 3 +++ fs/nilfs2/segment.c | 10 ++++++++++ fs/nilfs2/super.c | 8 +++++++- fs/nilfs2/the_nilfs.c | 38 ++++++++++++++++++++++++++++++++++++ fs/nilfs2/the_nilfs.h | 1 + include/linux/nilfs2_fs.h | 1 + 6 files changed, 60 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index 839efd8a8a8c..cf6d0d85ca82 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -74,6 +74,9 @@ norecovery Disable recovery of the filesystem on mount. This disables every write access on the device for read-only mounts or snapshots. This option will fail for r/w mounts on an unclean volume. +discard Issue discard/TRIM commands to the underlying block + device when blocks are freed. This is useful for SSD + devices and sparse/thinly-provisioned LUNs. NILFS2 usage ============ diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 105b508b47a8..9280b0f10792 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2560,6 +2560,16 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(sci->sc_interval); } + if (nilfs_test_opt(sbi, DISCARD)) { + int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs, + sci->sc_nfreesegs); + if (ret) { + printk(KERN_WARNING + "NILFS warning: error %d on discard request, " + "turning discards off for the device\n", ret); + nilfs_clear_opt(sbi, DISCARD); + } + } out_unlock: sci->sc_freesegs = NULL; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 8173faee31e6..3f88401a375b 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -481,6 +481,8 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",order=strict"); if (nilfs_test_opt(sbi, NORECOVERY)) seq_printf(seq, ",norecovery"); + if (nilfs_test_opt(sbi, DISCARD)) + seq_printf(seq, ",discard"); return 0; } @@ -550,7 +552,7 @@ static const struct export_operations nilfs_export_ops = { enum { Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, - Opt_err, + Opt_discard, Opt_err, }; static match_table_t tokens = { @@ -561,6 +563,7 @@ static match_table_t tokens = { {Opt_snapshot, "cp=%u"}, {Opt_order, "order=%s"}, {Opt_norecovery, "norecovery"}, + {Opt_discard, "discard"}, {Opt_err, NULL} }; @@ -614,6 +617,9 @@ static int parse_options(char *options, struct super_block *sb) case Opt_norecovery: nilfs_set_opt(sbi, NORECOVERY); break; + case Opt_discard: + nilfs_set_opt(sbi, DISCARD); + break; default: printk(KERN_ERR "NILFS: Unrecognized mount option \"%s\"\n", p); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 6241e1722efc..92733d5651d2 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -646,6 +646,44 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) goto out; } +int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, + size_t nsegs) +{ + sector_t seg_start, seg_end; + sector_t start = 0, nblocks = 0; + unsigned int sects_per_block; + __u64 *sn; + int ret = 0; + + sects_per_block = (1 << nilfs->ns_blocksize_bits) / + bdev_logical_block_size(nilfs->ns_bdev); + for (sn = segnump; sn < segnump + nsegs; sn++) { + nilfs_get_segment_range(nilfs, *sn, &seg_start, &seg_end); + + if (!nblocks) { + start = seg_start; + nblocks = seg_end - seg_start + 1; + } else if (start + nblocks == seg_start) { + nblocks += seg_end - seg_start + 1; + } else { + ret = blkdev_issue_discard(nilfs->ns_bdev, + start * sects_per_block, + nblocks * sects_per_block, + GFP_NOFS, + DISCARD_FL_BARRIER); + if (ret < 0) + return ret; + nblocks = 0; + } + } + if (nblocks) + ret = blkdev_issue_discard(nilfs->ns_bdev, + start * sects_per_block, + nblocks * sects_per_block, + GFP_NOFS, DISCARD_FL_BARRIER); + return ret; +} + int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) { struct inode *dat = nilfs_dat_inode(nilfs); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 589786e33464..fd057f8ad439 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -221,6 +221,7 @@ struct the_nilfs *find_or_create_nilfs(struct block_device *); void put_nilfs(struct the_nilfs *); int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); +int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t); int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64); int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 3fe02cf8b65a..640702e97457 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -153,6 +153,7 @@ struct nilfs_super_root { semantics also for data */ #define NILFS_MOUNT_NORECOVERY 0x4000 /* Disable write access during mount-time recovery */ +#define NILFS_MOUNT_DISCARD 0x8000 /* Issue DISCARD requests */ /** -- cgit v1.2.3 From 7512487e6d6459e4c3f9c7cedc53050a6c30e387 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Date: Tue, 26 Jan 2010 13:59:40 +0900 Subject: nilfs2: use mnt_want_write in ioctls where write access is needed A few nilfs2 ioctls need to ask for and then later release write access to the mount in order to avoid potential write to read-only mounts. This adds the missing mnt_want_write and mnt_drop_write in nilfs_ioctl_change_cpmode, nilfs_ioctl_delete_checkpoint, and nilfs_ioctl_clean_segments. Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> --- fs/nilfs2/ioctl.c | 60 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index d6b2b83de363..8e5cad020c30 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -26,6 +26,7 @@ #include <linux/capability.h> /* capable() */ #include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ #include <linux/vmalloc.h> +#include <linux/mount.h> /* mnt_want_write(), mnt_drop_write() */ #include <linux/nilfs2_fs.h> #include "nilfs.h" #include "segment.h" @@ -107,20 +108,28 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + + ret = -EFAULT; if (copy_from_user(&cpmode, argp, sizeof(cpmode))) - return -EFAULT; + goto out; mutex_lock(&nilfs->ns_mount_mutex); + nilfs_transaction_begin(inode->i_sb, &ti, 0); ret = nilfs_cpfile_change_cpmode( cpfile, cpmode.cm_cno, cpmode.cm_mode); - if (unlikely(ret < 0)) { + if (unlikely(ret < 0)) nilfs_transaction_abort(inode->i_sb); - mutex_unlock(&nilfs->ns_mount_mutex); - return ret; - } - nilfs_transaction_commit(inode->i_sb); /* never fails */ + else + nilfs_transaction_commit(inode->i_sb); /* never fails */ + mutex_unlock(&nilfs->ns_mount_mutex); +out: + mnt_drop_write(filp->f_path.mnt); return ret; } @@ -135,16 +144,23 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + + ret = -EFAULT; if (copy_from_user(&cno, argp, sizeof(cno))) - return -EFAULT; + goto out; nilfs_transaction_begin(inode->i_sb, &ti, 0); ret = nilfs_cpfile_delete_checkpoint(cpfile, cno); - if (unlikely(ret < 0)) { + if (unlikely(ret < 0)) nilfs_transaction_abort(inode->i_sb); - return ret; - } - nilfs_transaction_commit(inode->i_sb); /* never fails */ + else + nilfs_transaction_commit(inode->i_sb); /* never fails */ +out: + mnt_drop_write(filp->f_path.mnt); return ret; } @@ -496,12 +512,19 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + + ret = -EFAULT; if (copy_from_user(argv, argp, sizeof(argv))) - return -EFAULT; + goto out; + ret = -EINVAL; nsegs = argv[4].v_nmembs; if (argv[4].v_size != argsz[4]) - return -EINVAL; + goto out; + /* * argv[4] points to segment numbers this ioctl cleans. We * use kmalloc() for its buffer because memory used for the @@ -509,9 +532,10 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, */ kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base, nsegs * sizeof(__u64)); - if (IS_ERR(kbufs[4])) - return PTR_ERR(kbufs[4]); - + if (IS_ERR(kbufs[4])) { + ret = PTR_ERR(kbufs[4]); + goto out; + } nilfs = NILFS_SB(inode->i_sb)->s_nilfs; for (n = 0; n < 4; n++) { @@ -563,10 +587,12 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, nilfs_remove_all_gcinode(nilfs); clear_nilfs_gc_running(nilfs); - out_free: +out_free: while (--n >= 0) vfree(kbufs[n]); kfree(kbufs[4]); +out: + mnt_drop_write(filp->f_path.mnt); return ret; } -- cgit v1.2.3 From fe5f171bb272946ce5fbf843ce2f8467d0d41b9a Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Date: Sun, 31 Jan 2010 19:46:40 +0900 Subject: nilfs2: fix potential hang in nilfs_error on errors=remount-ro nilfs_error() calls nilfs_detach_segment_constructor() if errors=remount-ro option is specified, and this may lead to a hang due to recursive locking of, for instance, nilfs->ns_segctor_sem and others. In this case, detaching segment constructor is not necessary because read-only flag is set to the filesystem and further writes are blocked. This fixes the potential hang issue by removing the nilfs_detach_segment_constructor() call from nilfs_error. Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> --- fs/nilfs2/segment.c | 11 +++++++++-- fs/nilfs2/super.c | 3 --- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 9280b0f10792..ab439a7ef2d8 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2875,8 +2875,15 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi) struct the_nilfs *nilfs = sbi->s_nilfs; int err; - /* Each field of nilfs_segctor is cleared through the initialization - of super-block info */ + if (NILFS_SC(sbi)) { + /* + * This happens if the filesystem was remounted + * read/write after nilfs_error degenerated it into a + * read-only mount. + */ + nilfs_detach_segment_constructor(sbi); + } + sbi->s_sc_info = nilfs_segctor_new(sbi); if (!sbi->s_sc_info) return -ENOMEM; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 3f88401a375b..f068270f6c75 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -96,9 +96,6 @@ void nilfs_error(struct super_block *sb, const char *function, if (!(sb->s_flags & MS_RDONLY)) { struct the_nilfs *nilfs = sbi->s_nilfs; - if (!nilfs_test_opt(sbi, ERRORS_CONT)) - nilfs_detach_segment_constructor(sbi); - down_write(&nilfs->ns_sem); if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { nilfs->ns_mount_state |= NILFS_ERROR_FS; -- cgit v1.2.3 From 086d1764b22bb2d9d79bb8e2198927acf028d732 Mon Sep 17 00:00:00 2001 From: Jiro SEKIBA <jir@unicus.jp> Date: Fri, 5 Feb 2010 23:15:26 +0900 Subject: nilfs2: delete unnecessary condition in nilfs_dat_translate This is a trivial patch to delete unnecessary condition in nilfs_dat_translate. nilfs_dat_translate() will asign translated address to *blocknrp if blocknrp is not NULL. However the condition is unneeded, because all callers of nilfs_dat_translate() pass blocknrp properly. Signed-off-by: Jiro SEKIBA <jir@unicus.jp> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> --- fs/nilfs2/dat.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 187dd07ba86c..9d1e5de91afb 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -388,8 +388,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) ret = -ENOENT; goto out; } - if (blocknrp != NULL) - *blocknrp = blocknr; + *blocknrp = blocknr; out: kunmap_atomic(kaddr, KM_USER0); -- cgit v1.2.3 From dcd76186955e2b595c378dbe5b9bb6c8c5374b10 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Date: Tue, 26 Jan 2010 15:20:15 +0900 Subject: nilfs2: get rid of nilfs_segctor_req struct This will clean up nilfs_segctor_req struct and the obscure request argument passed among private methods of segment constructor. Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> --- fs/nilfs2/segment.c | 79 ++++++++++++++++++++++++++--------------------------- fs/nilfs2/segment.h | 2 ++ 2 files changed, 40 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index ab439a7ef2d8..fa4abfc22d33 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2425,43 +2425,43 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, return err; } -struct nilfs_segctor_req { - int mode; - __u32 seq_accepted; - int sc_err; /* construction failure */ - int sb_err; /* super block writeback failure */ -}; - #define FLUSH_FILE_BIT (0x1) /* data file only */ #define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */ -static void nilfs_segctor_accept(struct nilfs_sc_info *sci, - struct nilfs_segctor_req *req) +/** + * nilfs_segctor_accept - record accepted sequence count of log-write requests + * @sci: segment constructor object + */ +static void nilfs_segctor_accept(struct nilfs_sc_info *sci) { - req->sc_err = req->sb_err = 0; spin_lock(&sci->sc_state_lock); - req->seq_accepted = sci->sc_seq_request; + sci->sc_seq_accepted = sci->sc_seq_request; spin_unlock(&sci->sc_state_lock); if (sci->sc_timer) del_timer_sync(sci->sc_timer); } -static void nilfs_segctor_notify(struct nilfs_sc_info *sci, - struct nilfs_segctor_req *req) +/** + * nilfs_segctor_notify - notify the result of request to caller threads + * @sci: segment constructor object + * @mode: mode of log forming + * @err: error code to be notified + */ +static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err) { /* Clear requests (even when the construction failed) */ spin_lock(&sci->sc_state_lock); - if (req->mode == SC_LSEG_SR) { + if (mode == SC_LSEG_SR) { sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; - sci->sc_seq_done = req->seq_accepted; - nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err); + sci->sc_seq_done = sci->sc_seq_accepted; + nilfs_segctor_wakeup(sci, err); sci->sc_flush_request = 0; } else { - if (req->mode == SC_FLUSH_FILE) + if (mode == SC_FLUSH_FILE) sci->sc_flush_request &= ~FLUSH_FILE_BIT; - else if (req->mode == SC_FLUSH_DAT) + else if (mode == SC_FLUSH_DAT) sci->sc_flush_request &= ~FLUSH_DAT_BIT; /* re-enable timer if checkpoint creation was not done */ @@ -2472,30 +2472,37 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, spin_unlock(&sci->sc_state_lock); } -static int nilfs_segctor_construct(struct nilfs_sc_info *sci, - struct nilfs_segctor_req *req) +/** + * nilfs_segctor_construct - form logs and write them to disk + * @sci: segment constructor object + * @mode: mode of log forming + */ +static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) { struct nilfs_sb_info *sbi = sci->sc_sbi; struct the_nilfs *nilfs = sbi->s_nilfs; int err = 0; + nilfs_segctor_accept(sci); + if (nilfs_discontinued(nilfs)) - req->mode = SC_LSEG_SR; - if (!nilfs_segctor_confirm(sci)) { - err = nilfs_segctor_do_construct(sci, req->mode); - req->sc_err = err; - } + mode = SC_LSEG_SR; + if (!nilfs_segctor_confirm(sci)) + err = nilfs_segctor_do_construct(sci, mode); + if (likely(!err)) { - if (req->mode != SC_FLUSH_DAT) + if (mode != SC_FLUSH_DAT) atomic_set(&nilfs->ns_ndirtyblks, 0); if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && nilfs_discontinued(nilfs)) { down_write(&nilfs->ns_sem); - req->sb_err = nilfs_commit_super(sbi, - nilfs_altsb_need_update(nilfs)); + err = nilfs_commit_super( + sbi, nilfs_altsb_need_update(nilfs)); up_write(&nilfs->ns_sem); } } + + nilfs_segctor_notify(sci, mode, err); return err; } @@ -2526,7 +2533,6 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, struct nilfs_sc_info *sci = NILFS_SC(sbi); struct the_nilfs *nilfs = sbi->s_nilfs; struct nilfs_transaction_info ti; - struct nilfs_segctor_req req = { .mode = SC_LSEG_SR }; int err; if (unlikely(!sci)) @@ -2547,10 +2553,8 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes); for (;;) { - nilfs_segctor_accept(sci, &req); - err = nilfs_segctor_construct(sci, &req); + err = nilfs_segctor_construct(sci, SC_LSEG_SR); nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes); - nilfs_segctor_notify(sci, &req); if (likely(!err)) break; @@ -2583,13 +2587,9 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode) { struct nilfs_sb_info *sbi = sci->sc_sbi; struct nilfs_transaction_info ti; - struct nilfs_segctor_req req = { .mode = mode }; nilfs_transaction_lock(sbi, &ti, 0); - - nilfs_segctor_accept(sci, &req); - nilfs_segctor_construct(sci, &req); - nilfs_segctor_notify(sci, &req); + nilfs_segctor_construct(sci, mode); /* * Unclosed segment should be retried. We do this using sc_timer. @@ -2807,12 +2807,9 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci) do { struct nilfs_sb_info *sbi = sci->sc_sbi; struct nilfs_transaction_info ti; - struct nilfs_segctor_req req = { .mode = SC_LSEG_SR }; nilfs_transaction_lock(sbi, &ti, 0); - nilfs_segctor_accept(sci, &req); - ret = nilfs_segctor_construct(sci, &req); - nilfs_segctor_notify(sci, &req); + ret = nilfs_segctor_construct(sci, SC_LSEG_SR); nilfs_transaction_unlock(sbi); } while (ret && retrycount-- > 0); diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 3d3ab2f9864c..3155e0c7f415 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -116,6 +116,7 @@ struct nilfs_segsum_pointer { * @sc_wait_daemon: Daemon wait queue * @sc_wait_task: Start/end wait queue to control segctord task * @sc_seq_request: Request counter + * @sc_seq_accept: Accepted request count * @sc_seq_done: Completion counter * @sc_sync: Request of explicit sync operation * @sc_interval: Timeout value of background construction @@ -169,6 +170,7 @@ struct nilfs_sc_info { wait_queue_head_t sc_wait_task; __u32 sc_seq_request; + __u32 sc_seq_accepted; __u32 sc_seq_done; int sc_sync; -- cgit v1.2.3 From e605f0a7249d8002c660af379f884896cbaa45ae Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Date: Wed, 9 Dec 2009 00:57:52 +0900 Subject: nilfs2: get rid of s_dirt flag use This replaces s_dirt flag use in nilfs with a new flag added on the nilfs object. The s_dirt flag was used to indicate if sop->write_super() should be called, however the current version of nilfs does not use the callback. Thus, it can be replaced with the own flag. Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Cc: Jiro SEKIBA <jir@unicus.jp> --- fs/nilfs2/segment.c | 11 +++++------ fs/nilfs2/super.c | 4 ++-- fs/nilfs2/the_nilfs.h | 2 ++ 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index fa4abfc22d33..c4fcdd1567a9 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1937,8 +1937,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) { struct nilfs_segment_buffer *segbuf; struct page *bd_page = NULL, *fs_page = NULL; - struct nilfs_sb_info *sbi = sci->sc_sbi; - struct the_nilfs *nilfs = sbi->s_nilfs; + struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; int update_sr = (sci->sc_super_root != NULL); list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { @@ -2020,7 +2019,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) if (update_sr) { nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, segbuf->sb_sum.seg_seq, nilfs->ns_cno++); - sbi->s_super->s_dirt = 1; + set_nilfs_sb_dirty(nilfs); clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); @@ -2645,6 +2644,7 @@ static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci) static int nilfs_segctor_thread(void *arg) { struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; + struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; struct timer_list timer; int timeout = 0; @@ -2690,7 +2690,6 @@ static int nilfs_segctor_thread(void *arg) } else { DEFINE_WAIT(wait); int should_sleep = 1; - struct the_nilfs *nilfs; prepare_to_wait(&sci->sc_wait_daemon, &wait, TASK_INTERRUPTIBLE); @@ -2711,8 +2710,8 @@ static int nilfs_segctor_thread(void *arg) finish_wait(&sci->sc_wait_daemon, &wait); timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && time_after_eq(jiffies, sci->sc_timer->expires)); - nilfs = sci->sc_sbi->s_nilfs; - if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs)) + + if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs)) set_nilfs_discontinued(nilfs); } goto loop; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index f068270f6c75..92579cc4c935 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -298,7 +298,7 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb) memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); nilfs->ns_sbwtime[1] = t; } - sbi->s_super->s_dirt = 0; + clear_nilfs_sb_dirty(nilfs); return nilfs_sync_super(sbi, dupsb); } @@ -342,7 +342,7 @@ static int nilfs_sync_fs(struct super_block *sb, int wait) err = nilfs_construct_segment(sb); down_write(&nilfs->ns_sem); - if (sb->s_dirt) + if (nilfs_sb_dirty(nilfs)) nilfs_commit_super(sbi, 1); up_write(&nilfs->ns_sem); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index fd057f8ad439..e9795f1724d7 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -38,6 +38,7 @@ enum { the latest checkpoint was loaded */ THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ THE_NILFS_GC_RUNNING, /* gc process is running */ + THE_NILFS_SB_DIRTY, /* super block is dirty */ }; /** @@ -197,6 +198,7 @@ THE_NILFS_FNS(INIT, init) THE_NILFS_FNS(LOADED, loaded) THE_NILFS_FNS(DISCONTINUED, discontinued) THE_NILFS_FNS(GC_RUNNING, gc_running) +THE_NILFS_FNS(SB_DIRTY, sb_dirty) /* Minimum interval of periodical update of superblocks (in seconds) */ #define NILFS_SB_FREQ 10 -- cgit v1.2.3 From d1c6b72a7224f6cd6924f7079f79580cde696d68 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Date: Thu, 17 Dec 2009 00:55:40 +0900 Subject: nilfs2: move iterator to write log into segment buffer This moves iterator to submit write requests for a series of logs into segbuf.c, and hides nilfs_segbuf_write() and nilfs_segbuf_wait() in the file. Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> --- fs/nilfs2/segbuf.c | 18 ++++++++++++++++++ fs/nilfs2/segbuf.h | 5 +---- fs/nilfs2/segment.c | 9 ++------- 3 files changed, 21 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 645c78656aa0..ab56fe44e377 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -40,6 +40,11 @@ struct nilfs_write_info { }; +static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, + struct the_nilfs *nilfs); +static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf); + + static struct kmem_cache *nilfs_segbuf_cachep; static void nilfs_segbuf_init_once(void *obj) @@ -302,6 +307,19 @@ void nilfs_truncate_logs(struct list_head *logs, } } +int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs) +{ + struct nilfs_segment_buffer *segbuf; + int ret = 0; + + list_for_each_entry(segbuf, logs, sb_list) { + ret = nilfs_segbuf_write(segbuf, nilfs); + if (ret) + break; + } + return ret; +} + int nilfs_wait_on_logs(struct list_head *logs) { struct nilfs_segment_buffer *segbuf; diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h index 6af1630fb401..94dfd3517bc0 100644 --- a/fs/nilfs2/segbuf.h +++ b/fs/nilfs2/segbuf.h @@ -166,13 +166,10 @@ nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf, segbuf->sb_sum.nfileblk++; } -int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, - struct the_nilfs *nilfs); -int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf); - void nilfs_clear_logs(struct list_head *logs); void nilfs_truncate_logs(struct list_head *logs, struct nilfs_segment_buffer *last); +int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs); int nilfs_wait_on_logs(struct list_head *logs); static inline void nilfs_destroy_logs(struct list_head *logs) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index c4fcdd1567a9..ada2f1b947a3 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1764,14 +1764,9 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci, static int nilfs_segctor_write(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { - struct nilfs_segment_buffer *segbuf; - int ret = 0; + int ret; - list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { - ret = nilfs_segbuf_write(segbuf, nilfs); - if (ret) - break; - } + ret = nilfs_write_logs(&sci->sc_segbufs, nilfs); list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs); return ret; } -- cgit v1.2.3 From d67b1b03254c501fef371b0e5916c94a52bfc2c5 Mon Sep 17 00:00:00 2001 From: Julia Lawall <julia@diku.dk> Date: Sat, 6 Feb 2010 08:45:15 +0000 Subject: fs/xfs: Correct NULL test Test the value that was just allocated rather than the previously tested one. A simplified version of the semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // <smpl> @r@ expression *x; expression e; identifier l; @@ if (x == NULL || ...) { ... when forall return ...; } ... when != goto l; when != x = e when != &x *x == NULL // </smpl> Signed-off-by: Julia Lawall <julia@diku.dk> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/quota/xfs_qm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 8699e51cb45e..417e61e3d9dd 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -123,7 +123,7 @@ xfs_Gqm_init(void) goto out; gdqhash = kmem_zalloc_large(hsize); - if (!udqhash) + if (!gdqhash) goto out_free_udqhash; hsize /= sizeof(xfs_dqhash_t); -- cgit v1.2.3 From 7c540d9e3da38c3d1c15fb8059e4577a84ac0066 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr <jeremy.kerr@canonical.com> Date: Sun, 14 Feb 2010 07:13:41 -0700 Subject: proc_devtree: fix THIS_MODULE without module.h Commit e22f628395432b967f2f505858c64450f7835365 introduced a build breakage for ARM devtree work: the THIS_MODULE macro was added, but we don't have module.h This change adds the necessary #include to get THIS_MODULE defined. While we could just replace it with NULL (PROC_FS is a bool, not a tristate), using THIS_MODULE will prevent unexpected breakage if we ever do compile this as a module. Signed-off-by: Jeremy Kerr <jeremy.kerr@canonical.com> Signed-off-by: Grant Likely <grant.likely@secretlab.ca> Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Acked-by: Michal Simek <monstr@monstr.eu> --- fs/proc/proc_devtree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 0ec45110e15e..f8650dce74fb 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -11,6 +11,7 @@ #include <linux/stat.h> #include <linux/string.h> #include <linux/of.h> +#include <linux/module.h> #include <asm/prom.h> #include <asm/uaccess.h> #include "internal.h" -- cgit v1.2.3 From 12062dddda450976b129dcb1bacd91acaf4d8030 Mon Sep 17 00:00:00 2001 From: Eric Sandeen <sandeen@redhat.com> Date: Mon, 15 Feb 2010 14:19:27 -0500 Subject: ext4: move __func__ into a macro for ext4_warning, ext4_error Just a pet peeve of mine; we had a mishash of calls with either __func__ or "function_name" and the latter tends to get out of sync. I think it's easier to just hide the __func__ in a macro, and it'll be consistent from then on. Signed-off-by: Eric Sandeen <sandeen@redhat.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/balloc.c | 29 +++++--------- fs/ext4/dir.c | 4 +- fs/ext4/ext4.h | 7 +++- fs/ext4/ext4_jbd2.c | 2 +- fs/ext4/extents.c | 9 +++-- fs/ext4/ialloc.c | 27 +++++-------- fs/ext4/inode.c | 43 +++++++++------------ fs/ext4/mballoc.c | 27 ++++++------- fs/ext4/move_extent.c | 12 +++--- fs/ext4/namei.c | 51 +++++++++++-------------- fs/ext4/resize.c | 102 +++++++++++++++++++------------------------------- fs/ext4/super.c | 11 +++--- fs/ext4/xattr.c | 32 +++++++--------- 13 files changed, 146 insertions(+), 210 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 22bc7435d913..720061a0ee53 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -97,8 +97,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, __func__, - "Checksum bad for group %u", block_group); + ext4_error(sb, "Checksum bad for group %u", + block_group); ext4_free_blks_set(sb, gdp, 0); ext4_free_inodes_set(sb, gdp, 0); ext4_itable_unused_set(sb, gdp, 0); @@ -210,10 +210,8 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); if (block_group >= ngroups) { - ext4_error(sb, "ext4_get_group_desc", - "block_group >= groups_count - " - "block_group = %u, groups_count = %u", - block_group, ngroups); + ext4_error(sb, "block_group >= groups_count - block_group = %u," + " groups_count = %u", block_group, ngroups); return NULL; } @@ -221,8 +219,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); if (!sbi->s_group_desc[group_desc]) { - ext4_error(sb, "ext4_get_group_desc", - "Group descriptor not loaded - " + ext4_error(sb, "Group descriptor not loaded - " "block_group = %u, group_desc = %u, desc = %u", block_group, group_desc, offset); return NULL; @@ -282,9 +279,7 @@ static int ext4_valid_block_bitmap(struct super_block *sb, return 1; err_out: - ext4_error(sb, __func__, - "Invalid block bitmap - " - "block_group = %d, block = %llu", + ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu", block_group, bitmap_blk); return 0; } @@ -311,8 +306,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) bitmap_blk = ext4_block_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { - ext4_error(sb, __func__, - "Cannot read block bitmap - " + ext4_error(sb, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return NULL; @@ -354,8 +348,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); - ext4_error(sb, __func__, - "Cannot read block bitmap - " + ext4_error(sb, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return NULL; @@ -419,8 +412,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || in_range(block + count - 1, ext4_inode_table(sb, desc), sbi->s_itb_per_group)) { - ext4_error(sb, __func__, - "Adding blocks in system zones - " + ext4_error(sb, "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); goto error_return; @@ -453,8 +445,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, BUFFER_TRACE(bitmap_bh, "clear bit"); if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), bit + i, bitmap_bh->b_data)) { - ext4_error(sb, __func__, - "bit already cleared for block %llu", + ext4_error(sb, "bit already cleared for block %llu", (ext4_fsblk_t)(block + i)); BUFFER_TRACE(bitmap_bh, "bit already cleared"); } else { diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 9dc93168e262..0e0bef3ba91e 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -83,7 +83,7 @@ int ext4_check_dir_entry(const char *function, struct inode *dir, error_msg = "inode out of bounds"; if (error_msg != NULL) - ext4_error(dir->i_sb, function, + __ext4_error(dir->i_sb, function, "bad entry in directory #%lu: %s - " "offset=%u, inode=%u, rec_len=%d, name_len=%d", dir->i_ino, error_msg, offset, @@ -150,7 +150,7 @@ static int ext4_readdir(struct file *filp, */ if (!bh) { if (!dir_has_error) { - ext4_error(sb, __func__, "directory #%lu " + ext4_error(sb, "directory #%lu " "contains a hole at offset %Lu", inode->i_ino, (unsigned long long) filp->f_pos); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 61cf3b3cde4e..509437ffb71b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1486,13 +1486,16 @@ extern int ext4_group_extend(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ -extern void ext4_error(struct super_block *, const char *, const char *, ...) +extern void __ext4_error(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); +#define ext4_error(sb, message...) __ext4_error(sb, __func__, ## message) extern void __ext4_std_error(struct super_block *, const char *, int); extern void ext4_abort(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ext4_warning(struct super_block *, const char *, const char *, ...) +extern void __ext4_warning(struct super_block *, const char *, + const char *, ...) __attribute__ ((format (printf, 3, 4))); +#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message) extern void ext4_msg(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index b57e5c711b6d..2f407c487e6b 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -132,7 +132,7 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, if (inode && inode_needs_sync(inode)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, "IO error syncing inode, " "inode=%lu, block=%llu", inode->i_ino, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 54616157c0f3..bd808915ad2f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -440,7 +440,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode, return 0; corrupted: - ext4_error(inode->i_sb, function, + __ext4_error(inode->i_sb, function, "bad header/extent in inode #%lu: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), @@ -1538,8 +1538,9 @@ int ext4_ext_try_to_merge(struct inode *inode, merge_done = 1; WARN_ON(eh->eh_entries == 0); if (!eh->eh_entries) - ext4_error(inode->i_sb, "ext4_ext_try_to_merge", - "inode#%lu, eh->eh_entries = 0!", inode->i_ino); + ext4_error(inode->i_sb, + "inode#%lu, eh->eh_entries = 0!", + inode->i_ino); } return merge_done; @@ -3238,7 +3239,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * this is why assert can't be put in ext4_ext_find_extent() */ if (path[depth].p_ext == NULL && depth != 0) { - ext4_error(inode->i_sb, __func__, "bad extent address " + ext4_error(inode->i_sb, "bad extent address " "inode: %lu, iblock: %d, depth: %d", inode->i_ino, iblock, depth); err = -EIO; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 2fab5adae1e2..e4aaf619b56d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -76,8 +76,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, __func__, "Checksum bad for group %u", - block_group); + ext4_error(sb, "Checksum bad for group %u", block_group); ext4_free_blks_set(sb, gdp, 0); ext4_free_inodes_set(sb, gdp, 0); ext4_itable_unused_set(sb, gdp, 0); @@ -111,8 +110,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) bitmap_blk = ext4_inode_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { - ext4_error(sb, __func__, - "Cannot read inode bitmap - " + ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); return NULL; @@ -153,8 +151,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); - ext4_error(sb, __func__, - "Cannot read inode bitmap - " + ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); return NULL; @@ -229,8 +226,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) es = EXT4_SB(sb)->s_es; if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { - ext4_error(sb, "ext4_free_inode", - "reserved or nonexistent inode %lu", ino); + ext4_error(sb, "reserved or nonexistent inode %lu", ino); goto error_return; } block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); @@ -248,8 +244,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), bit, bitmap_bh->b_data); if (!cleared) - ext4_error(sb, "ext4_free_inode", - "bit already cleared for inode %lu", ino); + ext4_error(sb, "bit already cleared for inode %lu", ino); else { gdp = ext4_get_group_desc(sb, block_group, &bh2); @@ -736,8 +731,7 @@ static int ext4_claim_inode(struct super_block *sb, if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || ino > EXT4_INODES_PER_GROUP(sb)) { ext4_unlock_group(sb, group); - ext4_error(sb, __func__, - "reserved inode or inode > inodes count - " + ext4_error(sb, "reserved inode or inode > inodes count - " "block_group = %u, inode=%lu", group, ino + group * EXT4_INODES_PER_GROUP(sb)); return 1; @@ -1099,8 +1093,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) /* Error cases - e2fsck has already cleaned up for us */ if (ino > max_ino) { - ext4_warning(sb, __func__, - "bad orphan ino %lu! e2fsck was run?", ino); + ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino); goto error; } @@ -1108,8 +1101,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (!bitmap_bh) { - ext4_warning(sb, __func__, - "inode bitmap error for orphan %lu", ino); + ext4_warning(sb, "inode bitmap error for orphan %lu", ino); goto error; } @@ -1141,8 +1133,7 @@ iget_failed: err = PTR_ERR(inode); inode = NULL; bad_orphan: - ext4_warning(sb, __func__, - "bad orphan inode %lu! e2fsck was run?", ino); + ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino); printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, ext4_test_bit(bit, bitmap_bh->b_data)); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3e530119d7f0..536067bcf75b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -194,7 +194,7 @@ void ext4_delete_inode(struct inode *inode) inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { - ext4_warning(inode->i_sb, __func__, + ext4_warning(inode->i_sb, "couldn't mark inode dirty (err %d)", err); goto stop_handle; } @@ -212,7 +212,7 @@ void ext4_delete_inode(struct inode *inode) if (err > 0) err = ext4_journal_restart(handle, 3); if (err != 0) { - ext4_warning(inode->i_sb, __func__, + ext4_warning(inode->i_sb, "couldn't extend journal (err %d)", err); stop_handle: ext4_journal_stop(handle); @@ -323,8 +323,7 @@ static int ext4_block_to_path(struct inode *inode, offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else { - ext4_warning(inode->i_sb, "ext4_block_to_path", - "block %lu > max in inode %lu", + ext4_warning(inode->i_sb, "block %lu > max in inode %lu", i_block + direct_blocks + indirect_blocks + double_blocks, inode->i_ino); } @@ -344,7 +343,7 @@ static int __ext4_check_blockref(const char *function, struct inode *inode, if (blk && unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), blk, 1))) { - ext4_error(inode->i_sb, function, + __ext4_error(inode->i_sb, function, "invalid block reference %u " "in inode #%lu", blk, inode->i_ino); return -EIO; @@ -1125,7 +1124,7 @@ static int check_block_validity(struct inode *inode, const char *msg, sector_t logical, sector_t phys, int len) { if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { - ext4_error(inode->i_sb, msg, + __ext4_error(inode->i_sb, msg, "inode #%lu logical block %llu mapped to %llu " "(size %d)", inode->i_ino, (unsigned long long) logical, @@ -4147,7 +4146,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, count)) { - ext4_error(inode->i_sb, __func__, "inode #%lu: " + ext4_error(inode->i_sb, "inode #%lu: " "attempt to clear blocks %llu len %lu, invalid", inode->i_ino, (unsigned long long) block_to_free, count); @@ -4255,7 +4254,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) ext4_handle_dirty_metadata(handle, inode, this_bh); else - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, "circular indirect block detected, " "inode=%lu, block=%llu", inode->i_ino, @@ -4297,7 +4296,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), nr, 1)) { - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, "indirect mapped block in inode " "#%lu invalid (level %d, blk #%lu)", inode->i_ino, depth, @@ -4313,7 +4312,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, * (should be rare). */ if (!bh) { - ext4_error(inode->i_sb, "ext4_free_branches", + ext4_error(inode->i_sb, "Read failure, inode=%lu, block=%llu", inode->i_ino, nr); continue; @@ -4628,9 +4627,8 @@ static int __ext4_get_inode_loc(struct inode *inode, bh = sb_getblk(sb, block); if (!bh) { - ext4_error(sb, "ext4_get_inode_loc", "unable to read " - "inode block - inode=%lu, block=%llu", - inode->i_ino, block); + ext4_error(sb, "unable to read inode block - " + "inode=%lu, block=%llu", inode->i_ino, block); return -EIO; } if (!buffer_uptodate(bh)) { @@ -4728,9 +4726,8 @@ make_io: submit_bh(READ_META, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - ext4_error(sb, __func__, - "unable to read inode block - inode=%lu, " - "block=%llu", inode->i_ino, block); + ext4_error(sb, "unable to read inode block - inode=%lu," + " block=%llu", inode->i_ino, block); brelse(bh); return -EIO; } @@ -4941,8 +4938,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = 0; if (ei->i_file_acl && !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { - ext4_error(sb, __func__, - "bad extended attribute block %llu in inode #%lu", + ext4_error(sb, "bad extended attribute block %llu inode #%lu", ei->i_file_acl, inode->i_ino); ret = -EIO; goto bad_inode; @@ -4988,8 +4984,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else { ret = -EIO; - ext4_error(inode->i_sb, __func__, - "bogus i_mode (%o) for inode=%lu", + ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu", inode->i_mode, inode->i_ino); goto bad_inode; } @@ -5228,10 +5223,8 @@ int ext4_write_inode(struct inode *inode, int wait) if (wait) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { - ext4_error(inode->i_sb, __func__, - "IO error syncing inode, " - "inode=%lu, block=%llu", - inode->i_ino, + ext4_error(inode->i_sb, "IO error syncing inode, " + "inode=%lu, block=%llu", inode->i_ino, (unsigned long long)iloc.bh->b_blocknr); err = -EIO; } @@ -5644,7 +5637,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) EXT4_STATE_NO_EXPAND); if (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count)) { - ext4_warning(inode->i_sb, __func__, + ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete" " some EAs or run e2fsck.", inode->i_ino); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d129c1039f1d..415e11f1e9ee 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2709,8 +2709,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, len = ac->ac_b_ex.fe_len; if (!ext4_data_block_valid(sbi, block, len)) { - ext4_error(sb, __func__, - "Allocating blocks %llu-%llu which overlap " + ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata\n", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and repeat the block allocation @@ -3623,15 +3622,13 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { - ext4_error(sb, __func__, "Error in reading block " - "bitmap for %u", group); + ext4_error(sb, "Error reading block bitmap for %u", group); return 0; } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - ext4_error(sb, __func__, "Error in loading buddy " - "information for %u", group); + ext4_error(sb, "Error loading buddy information for %u", group); put_bh(bitmap_bh); return 0; } @@ -3804,15 +3801,15 @@ repeat: err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - ext4_error(sb, __func__, "Error in loading buddy " - "information for %u", group); + ext4_error(sb, "Error loading buddy information for %u", + group); continue; } bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { - ext4_error(sb, __func__, "Error in reading block " - "bitmap for %u", group); + ext4_error(sb, "Error reading block bitmap for %u", + group); ext4_mb_release_desc(&e4b); continue; } @@ -4077,8 +4074,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); if (ext4_mb_load_buddy(sb, group, &e4b)) { - ext4_error(sb, __func__, "Error in loading buddy " - "information for %u", group); + ext4_error(sb, "Error loading buddy information for %u", + group); continue; } ext4_lock_group(sb, group); @@ -4478,8 +4475,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, es = EXT4_SB(sb)->s_es; if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_data_block_valid(sbi, block, count)) { - ext4_error(sb, __func__, - "Freeing blocks not in datazone - " + ext4_error(sb, "Freeing blocks not in datazone - " "block = %llu, count = %lu", block, count); goto error_return; } @@ -4548,8 +4544,7 @@ do_more: in_range(block + count - 1, ext4_inode_table(sb, gdp), EXT4_SB(sb)->s_itb_per_group)) { - ext4_error(sb, __func__, - "Freeing blocks in system zone - " + ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ goto error_return; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 82c415be87a4..1654eb862d74 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -152,12 +152,12 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2, int ret = 0; if (inode1 == NULL) { - ext4_error(inode2->i_sb, function, + __ext4_error(inode2->i_sb, function, "Both inodes should not be NULL: " "inode1 NULL inode2 %lu", inode2->i_ino); ret = -EIO; } else if (inode2 == NULL) { - ext4_error(inode1->i_sb, function, + __ext4_error(inode1->i_sb, function, "Both inodes should not be NULL: " "inode1 %lu inode2 NULL", inode1->i_ino); ret = -EIO; @@ -526,7 +526,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, * new_ext |-------| */ if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { - ext4_error(orig_inode->i_sb, __func__, + ext4_error(orig_inode->i_sb, "new_ext_end(%u) should be less than or equal to " "oext->ee_block(%u) + oext_alen(%d) - 1", new_ext_end, le32_to_cpu(oext->ee_block), @@ -689,12 +689,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, while (1) { /* The extent for donor must be found. */ if (!dext) { - ext4_error(donor_inode->i_sb, __func__, + ext4_error(donor_inode->i_sb, "The extent for donor must be found"); *err = -EIO; goto out; } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { - ext4_error(donor_inode->i_sb, __func__, + ext4_error(donor_inode->i_sb, "Donor offset(%u) and the first block of donor " "extent(%u) should be equal", donor_off, @@ -1351,7 +1351,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, if (ret1 < 0) break; if (*moved_len > len) { - ext4_error(orig_inode->i_sb, __func__, + ext4_error(orig_inode->i_sb, "We replaced blocks too much! " "sum of replaced: %llu requested: %llu", *moved_len, len); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 17a17e10dd60..bd2dc0b71c8c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -383,8 +383,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY) { - ext4_warning(dir->i_sb, __func__, - "Unrecognised inode hash code %d", + ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", root->info.hash_version); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -399,8 +398,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, hash = hinfo->hash; if (root->info.unused_flags & 1) { - ext4_warning(dir->i_sb, __func__, - "Unimplemented inode hash flags: %#06x", + ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", root->info.unused_flags); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -408,8 +406,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, } if ((indirect = root->info.indirect_levels) > 1) { - ext4_warning(dir->i_sb, __func__, - "Unimplemented inode hash depth: %#06x", + ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", root->info.indirect_levels); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -421,8 +418,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { - ext4_warning(dir->i_sb, __func__, - "dx entry: limit != root limit"); + ext4_warning(dir->i_sb, "dx entry: limit != root limit"); brelse(bh); *err = ERR_BAD_DX_DIR; goto fail; @@ -433,7 +429,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { - ext4_warning(dir->i_sb, __func__, + ext4_warning(dir->i_sb, "dx entry: no count or count > limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -478,7 +474,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, goto fail2; at = entries = ((struct dx_node *) bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit (dir)) { - ext4_warning(dir->i_sb, __func__, + ext4_warning(dir->i_sb, "dx entry: limit != node limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -494,7 +490,7 @@ fail2: } fail: if (*err == ERR_BAD_DX_DIR) - ext4_warning(dir->i_sb, __func__, + ext4_warning(dir->i_sb, "Corrupt dir inode %ld, running e2fsck is " "recommended.", dir->i_ino); return NULL; @@ -947,9 +943,8 @@ restart: wait_on_buffer(bh); if (!buffer_uptodate(bh)) { /* read error, skip block & hope for the best */ - ext4_error(sb, __func__, "reading directory #%lu " - "offset %lu", dir->i_ino, - (unsigned long)block); + ext4_error(sb, "reading directory #%lu offset %lu", + dir->i_ino, (unsigned long)block); brelse(bh); goto next; } @@ -1041,7 +1036,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q retval = ext4_htree_next_block(dir, hash, frame, frames, NULL); if (retval < 0) { - ext4_warning(sb, __func__, + ext4_warning(sb, "error reading index page in directory #%lu", dir->i_ino); *err = retval; @@ -1071,14 +1066,13 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru __u32 ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(dir->i_sb, ino)) { - ext4_error(dir->i_sb, "ext4_lookup", - "bad inode number: %u", ino); + ext4_error(dir->i_sb, "bad inode number: %u", ino); return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); if (unlikely(IS_ERR(inode))) { if (PTR_ERR(inode) == -ESTALE) { - ext4_error(dir->i_sb, __func__, + ext4_error(dir->i_sb, "deleted inode referenced: %u", ino); return ERR_PTR(-EIO); @@ -1110,7 +1104,7 @@ struct dentry *ext4_get_parent(struct dentry *child) brelse(bh); if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { - ext4_error(child->d_inode->i_sb, "ext4_get_parent", + ext4_error(child->d_inode->i_sb, "bad inode number: %u", ino); return ERR_PTR(-EIO); } @@ -1410,7 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *)((char *)fde + ext4_rec_len_from_disk(fde->rec_len, blocksize)); if ((char *) de >= (((char *) root) + blocksize)) { - ext4_error(dir->i_sb, __func__, + ext4_error(dir->i_sb, "invalid rec_len for '..' in inode %lu", dir->i_ino); brelse(bh); @@ -1575,8 +1569,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (levels && (dx_get_count(frames->entries) == dx_get_limit(frames->entries))) { - ext4_warning(sb, __func__, - "Directory index full!"); + ext4_warning(sb, "Directory index full!"); err = -ENOSPC; goto cleanup; } @@ -1916,11 +1909,11 @@ static int empty_dir(struct inode *inode) if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { if (err) - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, "error %d reading directory #%lu offset 0", err, inode->i_ino); else - ext4_warning(inode->i_sb, __func__, + ext4_warning(inode->i_sb, "bad directory (dir #%lu) - no data block", inode->i_ino); return 1; @@ -1931,7 +1924,7 @@ static int empty_dir(struct inode *inode) !le32_to_cpu(de1->inode) || strcmp(".", de->name) || strcmp("..", de1->name)) { - ext4_warning(inode->i_sb, "empty_dir", + ext4_warning(inode->i_sb, "bad directory (dir #%lu) - no `.' or `..'", inode->i_ino); brelse(bh); @@ -1949,7 +1942,7 @@ static int empty_dir(struct inode *inode) offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); if (!bh) { if (err) - ext4_error(sb, __func__, + ext4_error(sb, "error %d reading directory" " #%lu offset %u", err, inode->i_ino, offset); @@ -2163,7 +2156,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) if (retval) goto end_rmdir; if (!EXT4_DIR_LINK_EMPTY(inode)) - ext4_warning(inode->i_sb, "ext4_rmdir", + ext4_warning(inode->i_sb, "empty directory has too many links (%d)", inode->i_nlink); inode->i_version++; @@ -2215,7 +2208,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) goto end_unlink; if (!inode->i_nlink) { - ext4_warning(inode->i_sb, "ext4_unlink", + ext4_warning(inode->i_sb, "Deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); inode->i_nlink = 1; @@ -2462,7 +2455,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, } } if (retval) { - ext4_warning(old_dir->i_sb, "ext4_rename", + ext4_warning(old_dir->i_sb, "Deleting old file (%lu), %d, error=%d", old_dir->i_ino, old_dir->i_nlink, retval); } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 3b2c5541d8a6..5692c48754a0 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -48,65 +48,54 @@ static int verify_group_input(struct super_block *sb, ext4_get_group_no_and_offset(sb, start, NULL, &offset); if (group != sbi->s_groups_count) - ext4_warning(sb, __func__, - "Cannot add at group %u (only %u groups)", + ext4_warning(sb, "Cannot add at group %u (only %u groups)", input->group, sbi->s_groups_count); else if (offset != 0) - ext4_warning(sb, __func__, "Last group not full"); + ext4_warning(sb, "Last group not full"); else if (input->reserved_blocks > input->blocks_count / 5) - ext4_warning(sb, __func__, "Reserved blocks too high (%u)", + ext4_warning(sb, "Reserved blocks too high (%u)", input->reserved_blocks); else if (free_blocks_count < 0) - ext4_warning(sb, __func__, "Bad blocks count %u", + ext4_warning(sb, "Bad blocks count %u", input->blocks_count); else if (!(bh = sb_bread(sb, end - 1))) - ext4_warning(sb, __func__, - "Cannot read last block (%llu)", + ext4_warning(sb, "Cannot read last block (%llu)", end - 1); else if (outside(input->block_bitmap, start, end)) - ext4_warning(sb, __func__, - "Block bitmap not in group (block %llu)", + ext4_warning(sb, "Block bitmap not in group (block %llu)", (unsigned long long)input->block_bitmap); else if (outside(input->inode_bitmap, start, end)) - ext4_warning(sb, __func__, - "Inode bitmap not in group (block %llu)", + ext4_warning(sb, "Inode bitmap not in group (block %llu)", (unsigned long long)input->inode_bitmap); else if (outside(input->inode_table, start, end) || outside(itend - 1, start, end)) - ext4_warning(sb, __func__, - "Inode table not in group (blocks %llu-%llu)", + ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)", (unsigned long long)input->inode_table, itend - 1); else if (input->inode_bitmap == input->block_bitmap) - ext4_warning(sb, __func__, - "Block bitmap same as inode bitmap (%llu)", + ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)", (unsigned long long)input->block_bitmap); else if (inside(input->block_bitmap, input->inode_table, itend)) - ext4_warning(sb, __func__, - "Block bitmap (%llu) in inode table (%llu-%llu)", + ext4_warning(sb, "Block bitmap (%llu) in inode table " + "(%llu-%llu)", (unsigned long long)input->block_bitmap, (unsigned long long)input->inode_table, itend - 1); else if (inside(input->inode_bitmap, input->inode_table, itend)) - ext4_warning(sb, __func__, - "Inode bitmap (%llu) in inode table (%llu-%llu)", + ext4_warning(sb, "Inode bitmap (%llu) in inode table " + "(%llu-%llu)", (unsigned long long)input->inode_bitmap, (unsigned long long)input->inode_table, itend - 1); else if (inside(input->block_bitmap, start, metaend)) - ext4_warning(sb, __func__, - "Block bitmap (%llu) in GDT table" - " (%llu-%llu)", + ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)", (unsigned long long)input->block_bitmap, start, metaend - 1); else if (inside(input->inode_bitmap, start, metaend)) - ext4_warning(sb, __func__, - "Inode bitmap (%llu) in GDT table" - " (%llu-%llu)", + ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)", (unsigned long long)input->inode_bitmap, start, metaend - 1); else if (inside(input->inode_table, start, metaend) || inside(itend - 1, start, metaend)) - ext4_warning(sb, __func__, - "Inode table (%llu-%llu) overlaps" - "GDT table (%llu-%llu)", + ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table " + "(%llu-%llu)", (unsigned long long)input->inode_table, itend - 1, start, metaend - 1); else @@ -364,8 +353,7 @@ static int verify_reserved_gdb(struct super_block *sb, while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { if (le32_to_cpu(*p++) != grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ - ext4_warning(sb, __func__, - "reserved GDT %llu" + ext4_warning(sb, "reserved GDT %llu" " missing grp %d (%llu)", blk, grp, grp * @@ -420,8 +408,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, */ if (EXT4_SB(sb)->s_sbh->b_blocknr != le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { - ext4_warning(sb, __func__, - "won't resize using backup superblock at %llu", + ext4_warning(sb, "won't resize using backup superblock at %llu", (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); return -EPERM; } @@ -444,8 +431,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, data = (__le32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { - ext4_warning(sb, __func__, - "new group %u GDT block %llu not reserved", + ext4_warning(sb, "new group %u GDT block %llu not reserved", input->group, gdblock); err = -EINVAL; goto exit_dind; @@ -468,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, GFP_NOFS); if (!n_group_desc) { err = -ENOMEM; - ext4_warning(sb, __func__, + ext4_warning(sb, "not enough memory for %lu groups", gdb_num + 1); goto exit_inode; } @@ -567,8 +553,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, /* Get each reserved primary GDT block and verify it holds backups */ for (res = 0; res < reserved_gdb; res++, blk++) { if (le32_to_cpu(*data) != blk) { - ext4_warning(sb, __func__, - "reserved block %llu" + ext4_warning(sb, "reserved block %llu" " not at offset %ld", blk, (long)(data - (__le32 *)dind->b_data)); @@ -713,8 +698,7 @@ static void update_backups(struct super_block *sb, */ exit_err: if (err) { - ext4_warning(sb, __func__, - "can't update backup for group %u (err %d), " + ext4_warning(sb, "can't update backup for group %u (err %d), " "forcing fsck on next reboot", group, err); sbi->s_mount_state &= ~EXT4_VALID_FS; sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); @@ -753,20 +737,19 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { - ext4_warning(sb, __func__, - "Can't resize non-sparse filesystem further"); + ext4_warning(sb, "Can't resize non-sparse filesystem further"); return -EPERM; } if (ext4_blocks_count(es) + input->blocks_count < ext4_blocks_count(es)) { - ext4_warning(sb, __func__, "blocks_count overflow"); + ext4_warning(sb, "blocks_count overflow"); return -EINVAL; } if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < le32_to_cpu(es->s_inodes_count)) { - ext4_warning(sb, __func__, "inodes_count overflow"); + ext4_warning(sb, "inodes_count overflow"); return -EINVAL; } @@ -774,14 +757,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) || !le16_to_cpu(es->s_reserved_gdt_blocks)) { - ext4_warning(sb, __func__, + ext4_warning(sb, "No reserved GDT blocks, can't resize"); return -EPERM; } inode = ext4_iget(sb, EXT4_RESIZE_INO); if (IS_ERR(inode)) { - ext4_warning(sb, __func__, - "Error opening resize inode"); + ext4_warning(sb, "Error opening resize inode"); return PTR_ERR(inode); } } @@ -810,8 +792,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { - ext4_warning(sb, __func__, - "multiple resizers run on filesystem!"); + ext4_warning(sb, "multiple resizers run on filesystem!"); err = -EBUSY; goto exit_journal; } @@ -997,13 +978,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, " too large to resize to %llu blocks safely\n", sb->s_id, n_blocks_count); if (sizeof(sector_t) < 8) - ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled"); + ext4_warning(sb, "CONFIG_LBDAF not enabled"); return -EINVAL; } if (n_blocks_count < o_blocks_count) { - ext4_warning(sb, __func__, - "can't shrink FS - resize aborted"); + ext4_warning(sb, "can't shrink FS - resize aborted"); return -EBUSY; } @@ -1011,15 +991,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); if (last == 0) { - ext4_warning(sb, __func__, - "need to use ext2online to resize further"); + ext4_warning(sb, "need to use ext2online to resize further"); return -EPERM; } add = EXT4_BLOCKS_PER_GROUP(sb) - last; if (o_blocks_count + add < o_blocks_count) { - ext4_warning(sb, __func__, "blocks_count overflow"); + ext4_warning(sb, "blocks_count overflow"); return -EINVAL; } @@ -1027,16 +1006,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, add = n_blocks_count - o_blocks_count; if (o_blocks_count + add < n_blocks_count) - ext4_warning(sb, __func__, - "will only finish group (%llu" - " blocks, %u new)", + ext4_warning(sb, "will only finish group (%llu blocks, %u new)", o_blocks_count + add, add); /* See if the device is actually as big as what was requested */ bh = sb_bread(sb, o_blocks_count + add - 1); if (!bh) { - ext4_warning(sb, __func__, - "can't read last block, resize aborted"); + ext4_warning(sb, "can't read last block, resize aborted"); return -ENOSPC; } brelse(bh); @@ -1047,14 +1023,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, handle = ext4_journal_start_sb(sb, 3); if (IS_ERR(handle)) { err = PTR_ERR(handle); - ext4_warning(sb, __func__, "error %d on journal start", err); + ext4_warning(sb, "error %d on journal start", err); goto exit_put; } mutex_lock(&EXT4_SB(sb)->s_resize_lock); if (o_blocks_count != ext4_blocks_count(es)) { - ext4_warning(sb, __func__, - "multiple resizers run on filesystem!"); + ext4_warning(sb, "multiple resizers run on filesystem!"); mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_journal_stop(handle); err = -EBUSY; @@ -1063,8 +1038,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) { - ext4_warning(sb, __func__, - "error %d on journal write access", err); + ext4_warning(sb, "error %d on journal write access", err); mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_journal_stop(handle); goto exit_put; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 68a55dffb360..1c85bb67e6eb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -333,7 +333,7 @@ static void ext4_handle_error(struct super_block *sb) sb->s_id); } -void ext4_error(struct super_block *sb, const char *function, +void __ext4_error(struct super_block *sb, const char *function, const char *fmt, ...) { va_list args; @@ -450,7 +450,7 @@ void ext4_msg (struct super_block * sb, const char *prefix, va_end(args); } -void ext4_warning(struct super_block *sb, const char *function, +void __ext4_warning(struct super_block *sb, const char *function, const char *fmt, ...) { va_list args; @@ -507,7 +507,7 @@ void ext4_update_dynamic_rev(struct super_block *sb) if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) return; - ext4_warning(sb, __func__, + ext4_warning(sb, "updating to rev %d because of new feature flag, " "running e2fsck is recommended", EXT4_DYNAMIC_REV); @@ -3367,10 +3367,9 @@ static void ext4_clear_journal_err(struct super_block *sb, char nbuf[16]; errstr = ext4_decode_error(sb, j_errno, nbuf); - ext4_warning(sb, __func__, "Filesystem error recorded " + ext4_warning(sb, "Filesystem error recorded " "from previous mount: %s", errstr); - ext4_warning(sb, __func__, "Marking fs in need of " - "filesystem check."); + ext4_warning(sb, "Marking fs in need of filesystem check."); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c619a7ea670d..627c98abbed9 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -227,7 +227,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext4_xattr_check_block(bh)) { -bad_block: ext4_error(inode->i_sb, __func__, +bad_block: + ext4_error(inode->i_sb, "inode %lu: bad block %llu", inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; @@ -371,7 +372,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext4_xattr_check_block(bh)) { - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, "inode %lu: bad block %llu", inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; @@ -665,9 +666,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); if (ext4_xattr_check_block(bs->bh)) { - ext4_error(sb, __func__, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + ext4_error(sb, "inode %lu: bad block %llu", + inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } @@ -880,9 +880,8 @@ cleanup_dquot: goto cleanup; bad_block: - ext4_error(inode->i_sb, __func__, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + ext4_error(inode->i_sb, "inode %lu: bad block %llu", + inode->i_ino, EXT4_I(inode)->i_file_acl); goto cleanup; #undef header @@ -1195,9 +1194,8 @@ retry: if (!bh) goto cleanup; if (ext4_xattr_check_block(bh)) { - ext4_error(inode->i_sb, __func__, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + ext4_error(inode->i_sb, "inode %lu: bad block %llu", + inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } @@ -1372,16 +1370,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) goto cleanup; bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); if (!bh) { - ext4_error(inode->i_sb, __func__, - "inode %lu: block %llu read error", inode->i_ino, - EXT4_I(inode)->i_file_acl); + ext4_error(inode->i_sb, "inode %lu: block %llu read error", + inode->i_ino, EXT4_I(inode)->i_file_acl); goto cleanup; } if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) { - ext4_error(inode->i_sb, __func__, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + ext4_error(inode->i_sb, "inode %lu: bad block %llu", + inode->i_ino, EXT4_I(inode)->i_file_acl); goto cleanup; } ext4_xattr_release_block(handle, inode, bh); @@ -1506,7 +1502,7 @@ again: } bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, "inode %lu: block %lu read error", inode->i_ino, (unsigned long) ce->e_block); } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= -- cgit v1.2.3 From 9aaab0589baa61d637a52badddbff2d74f35a955 Mon Sep 17 00:00:00 2001 From: Roel Kluin <roel.kluin@gmail.com> Date: Mon, 15 Feb 2010 14:26:16 -0500 Subject: ext4: add missing error checking to ext4_expand_extra_isize_ea() Signed-off-by: Roel Kluin <roel.kluin@gmail.com> --- fs/ext4/xattr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 627c98abbed9..efc16a4b7ceb 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1300,6 +1300,8 @@ retry: /* Remove the chosen entry from the inode */ error = ext4_xattr_ibody_set(handle, inode, &i, is); + if (error) + goto cleanup; entry = IFIRST(header); if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) -- cgit v1.2.3 From aca92ff6f57c000d1b4523e383c8bd6b8269b8b1 Mon Sep 17 00:00:00 2001 From: Leonard Michlmayr <leonard.michlmayr@gmail.com> Date: Thu, 4 Mar 2010 17:07:28 -0500 Subject: ext4: correctly calculate number of blocks for fiemap ext4_fiemap() rounds the length of the requested range down to blocksize, which is is not the true number of blocks that cover the requested region. This problem is especially impressive if the user requests only the first byte of a file: not a single extent will be reported. We fix this by calculating the last block of the region and then subtract to find the number of blocks in the extents. Signed-off-by: Leonard Michlmayr <leonard.michlmayr@gmail.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/extents.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bd808915ad2f..7d54850f7136 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3768,7 +3768,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { ext4_lblk_t start_blk; - ext4_lblk_t len_blks; int error = 0; /* fallback to generic here if not in extents fmt */ @@ -3782,8 +3781,14 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { error = ext4_xattr_fiemap(inode, fieinfo); } else { + ext4_lblk_t len_blks; + __u64 last_blk; + start_blk = start >> inode->i_sb->s_blocksize_bits; - len_blks = len >> inode->i_sb->s_blocksize_bits; + last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; + if (last_blk >= EXT_MAX_BLOCK) + last_blk = EXT_MAX_BLOCK-1; + len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; /* * Walk the extent tree gathering extent information. -- cgit v1.2.3 From ba869023eac8354b17acdcff82b851ea8e7b1809 Mon Sep 17 00:00:00 2001 From: dingdinghua <dingdinghua@nrchpc.ac.cn> Date: Mon, 15 Feb 2010 16:35:42 -0500 Subject: jbd2: delay discarding buffers in journal_unmap_buffer Delay discarding buffers in journal_unmap_buffer until we know that "add to orphan" operation has definitely been committed, otherwise the log space of committing transation may be freed and reused before truncate get committed, updates may get lost if crash happens. Signed-off-by: dingdinghua <dingdinghua@nrchpc.ac.cn> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/jbd2/commit.c | 10 +++++----- fs/jbd2/transaction.c | 43 +++++++++++++++++++++++++++++++------------ 2 files changed, 36 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 1bc74b6f26d2..3ee211ed58f1 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -930,12 +930,12 @@ restart_loop: /* A buffer which has been freed while still being * journaled by a previous transaction may end up still * being dirty here, but we want to avoid writing back - * that buffer in the future now that the last use has - * been committed. That's not only a performance gain, - * it also stops aliasing problems if the buffer is left - * behind for writeback and gets reallocated for another + * that buffer in the future after the "add to orphan" + * operation been committed, That's not only a performance + * gain, it also stops aliasing problems if the buffer is + * left behind for writeback and gets reallocated for another * use in a different page. */ - if (buffer_freed(bh)) { + if (buffer_freed(bh) && !jh->b_next_transaction) { clear_buffer_freed(bh); clear_buffer_jbddirty(bh); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index a0512700542f..bfc70f57900f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1727,6 +1727,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) if (!jh) goto zap_buffer_no_jh; + /* + * We cannot remove the buffer from checkpoint lists until the + * transaction adding inode to orphan list (let's call it T) + * is committed. Otherwise if the transaction changing the + * buffer would be cleaned from the journal before T is + * committed, a crash will cause that the correct contents of + * the buffer will be lost. On the other hand we have to + * clear the buffer dirty bit at latest at the moment when the + * transaction marking the buffer as freed in the filesystem + * structures is committed because from that moment on the + * buffer can be reallocated and used by a different page. + * Since the block hasn't been freed yet but the inode has + * already been added to orphan list, it is safe for us to add + * the buffer to BJ_Forget list of the newest transaction. + */ transaction = jh->b_transaction; if (transaction == NULL) { /* First case: not on any transaction. If it @@ -1783,16 +1798,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) } else if (transaction == journal->j_committing_transaction) { JBUFFER_TRACE(jh, "on committing transaction"); /* - * If it is committing, we simply cannot touch it. We - * can remove it's next_transaction pointer from the - * running transaction if that is set, but nothing - * else. */ + * The buffer is committing, we simply cannot touch + * it. So we just set j_next_transaction to the + * running transaction (if there is one) and mark + * buffer as freed so that commit code knows it should + * clear dirty bits when it is done with the buffer. + */ set_buffer_freed(bh); - if (jh->b_next_transaction) { - J_ASSERT(jh->b_next_transaction == - journal->j_running_transaction); - jh->b_next_transaction = NULL; - } + if (journal->j_running_transaction && buffer_jbddirty(bh)) + jh->b_next_transaction = journal->j_running_transaction; jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -1969,7 +1983,7 @@ void jbd2_journal_file_buffer(struct journal_head *jh, */ void __jbd2_journal_refile_buffer(struct journal_head *jh) { - int was_dirty; + int was_dirty, jlist; struct buffer_head *bh = jh2bh(jh); J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); @@ -1991,8 +2005,13 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) __jbd2_journal_temp_unlink_buffer(jh); jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; - __jbd2_journal_file_buffer(jh, jh->b_transaction, - jh->b_modified ? BJ_Metadata : BJ_Reserved); + if (buffer_freed(bh)) + jlist = BJ_Forget; + else if (jh->b_modified) + jlist = BJ_Metadata; + else + jlist = BJ_Reserved; + __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist); J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); if (was_dirty) -- cgit v1.2.3 From 73b50c1c92666d326b5fa2c945d46509f2f6d91f Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth <curtw@google.com> Date: Tue, 16 Feb 2010 15:06:29 -0500 Subject: ext4: Fix BUG_ON at fs/buffer.c:652 in no journal mode Calls to ext4_handle_dirty_metadata should only pass in an inode pointer for inode-specific metadata, and not for shared metadata blocks such as inode table blocks, block group descriptors, the superblock, etc. The BUG_ON can get tripped when updating a special device (such as a block device) that is opened (so that i_mapping is set in fs/block_dev.c) and the file system is mounted in no journal mode. Addresses-Google-Bug: #2404870 Signed-off-by: Curt Wohlgemuth <curtw@google.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/ext4_jbd2.c | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/inode.c | 6 +++--- fs/ext4/namei.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 2f407c487e6b..53d2764d71ca 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -125,7 +125,7 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, ext4_journal_abort_handle(where, __func__, bh, handle, err); } else { - if (inode && bh) + if (inode) mark_buffer_dirty_inode(bh, inode); else mark_buffer_dirty(bh); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e4aaf619b56d..004c9da9e5c6 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -898,7 +898,7 @@ repeat_in_this_group: BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, - inode, + NULL, inode_bitmap_bh); if (err) goto fail; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 536067bcf75b..ecac8c5a6f5c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5120,7 +5120,7 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_FEATURE_RO_COMPAT_LARGE_FILE); sb->s_dirt = 1; ext4_handle_sync(handle); - err = ext4_handle_dirty_metadata(handle, inode, + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } } @@ -5149,7 +5149,7 @@ static int ext4_do_update_inode(handle_t *handle, } BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - rc = ext4_handle_dirty_metadata(handle, inode, bh); + rc = ext4_handle_dirty_metadata(handle, NULL, bh); if (!err) err = rc; ext4_clear_inode_state(inode, EXT4_STATE_NEW); @@ -5701,7 +5701,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode) err = jbd2_journal_get_write_access(handle, iloc.bh); if (!err) err = ext4_handle_dirty_metadata(handle, - inode, + NULL, iloc.bh); brelse(iloc.bh); } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index bd2dc0b71c8c..a13948f8242f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2017,7 +2017,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) /* Insert this inode at the head of the on-disk orphan list... */ NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); - err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; @@ -2089,7 +2089,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) if (err) goto out_brelse; sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); - err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh); + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); } else { struct ext4_iloc iloc2; struct inode *i_prev = -- cgit v1.2.3 From 003cb608a2533d0927a83bc4e07e46d7a622eda9 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 2 Feb 2010 14:39:01 +0900 Subject: percpu: add __percpu sparse annotations to fs Add __percpu sparse annotations to fs. These annotations are to make sparse consider percpu variables to be in a different address space and warn if accessed without going through percpu accessors. This patch doesn't affect normal builds. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: Alex Elder <aelder@sgi.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> --- fs/ext4/ext4.h | 2 +- fs/nfs/iostat.h | 4 ++-- fs/xfs/xfs_mount.h | 2 +- include/linux/mount.h | 2 +- include/linux/nfs_fs_sb.h | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 874d169a193e..4cedc91ec59d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1014,7 +1014,7 @@ struct ext4_sb_info { atomic_t s_lock_busy; /* locality groups */ - struct ext4_locality_group *s_locality_groups; + struct ext4_locality_group __percpu *s_locality_groups; /* for write statistics */ unsigned long s_sectors_written_start; diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index 46d779abafd3..1d8d5c813b01 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -57,12 +57,12 @@ static inline void nfs_add_fscache_stats(struct inode *inode, } #endif -static inline struct nfs_iostats *nfs_alloc_iostats(void) +static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void) { return alloc_percpu(struct nfs_iostats); } -static inline void nfs_free_iostats(struct nfs_iostats *stats) +static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats) { if (stats != NULL) free_percpu(stats); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1df7e4502967..24c88870cdb2 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -243,7 +243,7 @@ typedef struct xfs_mount { struct xfs_qmops *m_qm_ops; /* vector of XQM ops */ atomic_t m_active_trans; /* number trans frozen */ #ifdef HAVE_PERCPU_SB - xfs_icsb_cnts_t *m_sb_cnts; /* per-cpu superblock counters */ + xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ unsigned long m_icsb_counters; /* disabled per-cpu counters */ struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ struct mutex m_icsb_mutex; /* balancer sync lock */ diff --git a/include/linux/mount.h b/include/linux/mount.h index 5d5275364867..b5f43a34ef88 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -66,7 +66,7 @@ struct vfsmount { int mnt_pinned; int mnt_ghosts; #ifdef CONFIG_SMP - int *mnt_writers; + int __percpu *mnt_writers; #else int mnt_writers; #endif diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 34fc6be5bfcf..6a2e44fd75e2 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -105,7 +105,7 @@ struct nfs_server { struct rpc_clnt * client; /* RPC client handle */ struct rpc_clnt * client_acl; /* ACL RPC client handle */ struct nlm_host *nlm_host; /* NLM client handle */ - struct nfs_iostats * io_stats; /* I/O statistics */ + struct nfs_iostats __percpu *io_stats; /* I/O statistics */ struct backing_dev_info backing_dev_info; atomic_long_t writeback; /* number of writeback pages */ int flags; /* various flags */ -- cgit v1.2.3 From 03f29365e84ff6d651be4e6186e0400ca59da6cd Mon Sep 17 00:00:00 2001 From: Jiro SEKIBA <jir@unicus.jp> Date: Thu, 18 Feb 2010 19:11:35 +0900 Subject: nilfs2: delete unnecessary condition in load_segment_summary This is a trivial patch to remove unnecessary condition. load_segment_summary() checks crc of segment_summary OR crc of whole log data blocks based on boolean argument full_check. However, callers of the function pass only 1 as full_check, which means only whole log data blocks checking code is running all the time. This patch deletes the condition and full_check argument and also deletes enum 'NILFS_SEG_FAIL_CHECKSUM_SEGSUM' and corresponding case clause, for it is nolonger used anymore. Signed-off-by: Jiro SEKIBA <jir@unicus.jp> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> --- fs/nilfs2/recovery.c | 41 +++++++++++------------------------------ 1 file changed, 11 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index c9c96c7825dc..017bedc761a0 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -39,7 +39,6 @@ enum { NILFS_SEG_FAIL_IO, NILFS_SEG_FAIL_MAGIC, NILFS_SEG_FAIL_SEQ, - NILFS_SEG_FAIL_CHECKSUM_SEGSUM, NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT, NILFS_SEG_FAIL_CHECKSUM_FULL, NILFS_SEG_FAIL_CONSISTENCY, @@ -71,10 +70,6 @@ static int nilfs_warn_segment_error(int err) printk(KERN_WARNING "NILFS warning: Sequence number mismatch\n"); break; - case NILFS_SEG_FAIL_CHECKSUM_SEGSUM: - printk(KERN_WARNING - "NILFS warning: Checksum error in segment summary\n"); - break; case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: printk(KERN_WARNING "NILFS warning: Checksum error in super root\n"); @@ -206,19 +201,15 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block, * @pseg_start: start disk block number of partial segment * @seg_seq: sequence number requested * @ssi: pointer to nilfs_segsum_info struct to store information - * @full_check: full check flag - * (0: only checks segment summary CRC, 1: data CRC) */ static int load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start, - u64 seg_seq, struct nilfs_segsum_info *ssi, - int full_check) + u64 seg_seq, struct nilfs_segsum_info *ssi) { struct buffer_head *bh_sum; struct nilfs_segment_summary *sum; - unsigned long offset, nblock; - u64 check_bytes; - u32 crc, crc_sum; + unsigned long nblock; + u32 crc; int ret = NILFS_SEG_FAIL_IO; bh_sum = sb_bread(sbi->s_super, pseg_start); @@ -237,34 +228,24 @@ load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start, ret = NILFS_SEG_FAIL_SEQ; goto failed; } - if (full_check) { - offset = sizeof(sum->ss_datasum); - check_bytes = - ((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits); - nblock = ssi->nblocks; - crc_sum = le32_to_cpu(sum->ss_datasum); - ret = NILFS_SEG_FAIL_CHECKSUM_FULL; - } else { /* only checks segment summary */ - offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum); - check_bytes = ssi->sumbytes; - nblock = ssi->nsumblk; - crc_sum = le32_to_cpu(sum->ss_sumsum); - ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM; - } + nblock = ssi->nblocks; if (unlikely(nblock == 0 || nblock > sbi->s_nilfs->ns_blocks_per_segment)) { /* This limits the number of blocks read in the CRC check */ ret = NILFS_SEG_FAIL_CONSISTENCY; goto failed; } - if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes, + if (calc_crc_cont(sbi, bh_sum, &crc, sizeof(sum->ss_datasum), + ((u64)nblock << sbi->s_super->s_blocksize_bits), pseg_start, nblock)) { ret = NILFS_SEG_FAIL_IO; goto failed; } - if (crc == crc_sum) + if (crc == le32_to_cpu(sum->ss_datasum)) ret = 0; + else + ret = NILFS_SEG_FAIL_CHECKSUM_FULL; failed: brelse(bh_sum); out: @@ -598,7 +579,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { - ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); + ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi); if (ret) { if (ret == NILFS_SEG_FAIL_IO) { err = -EIO; @@ -821,7 +802,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, for (;;) { /* Load segment summary */ - ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); + ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi); if (ret) { if (ret == NILFS_SEG_FAIL_IO) goto failed; -- cgit v1.2.3 From c44dcc56d2b5c79ba3063d20f76e5347e2e418f6 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Thu, 11 Feb 2010 02:24:46 -0500 Subject: switch inotify_user to anon_inode Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/notify/inotify/inotify_user.c | 59 ++++------------------------------------ include/linux/magic.h | 1 - 2 files changed, 6 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index a94e8bd8eb1f..472cdf29ef82 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -29,14 +29,12 @@ #include <linux/init.h> /* module_init */ #include <linux/inotify.h> #include <linux/kernel.h> /* roundup() */ -#include <linux/magic.h> /* superblock magic number */ -#include <linux/mount.h> /* mntget */ #include <linux/namei.h> /* LOOKUP_FOLLOW */ -#include <linux/path.h> /* struct path */ #include <linux/sched.h> /* struct user */ #include <linux/slab.h> /* struct kmem_cache */ #include <linux/syscalls.h> #include <linux/types.h> +#include <linux/anon_inodes.h> #include <linux/uaccess.h> #include <linux/poll.h> #include <linux/wait.h> @@ -45,8 +43,6 @@ #include <asm/ioctls.h> -static struct vfsmount *inotify_mnt __read_mostly; - /* these are configurable via /proc/sys/fs/inotify/ */ static int inotify_max_user_instances __read_mostly; static int inotify_max_queued_events __read_mostly; @@ -645,9 +641,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) { struct fsnotify_group *group; struct user_struct *user; - struct file *filp; - struct path path; - int fd, ret; + int ret; /* Check the IN_* constants for consistency. */ BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC); @@ -656,10 +650,6 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) if (flags & ~(IN_CLOEXEC | IN_NONBLOCK)) return -EINVAL; - fd = get_unused_fd_flags(flags & O_CLOEXEC); - if (fd < 0) - return fd; - user = get_current_user(); if (unlikely(atomic_read(&user->inotify_devs) >= inotify_max_user_instances)) { @@ -676,27 +666,14 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) atomic_inc(&user->inotify_devs); - path.mnt = inotify_mnt; - path.dentry = inotify_mnt->mnt_root; - path_get(&path); - filp = alloc_file(&path, FMODE_READ, &inotify_fops); - if (!filp) - goto Enfile; + ret = anon_inode_getfd("inotify", &inotify_fops, group, + O_RDONLY | flags); + if (ret >= 0) + return ret; - filp->f_flags = O_RDONLY | (flags & O_NONBLOCK); - filp->private_data = group; - - fd_install(fd, filp); - - return fd; - -Enfile: - ret = -ENFILE; - path_put(&path); atomic_dec(&user->inotify_devs); out_free_uid: free_uid(user); - put_unused_fd(fd); return ret; } @@ -783,20 +760,6 @@ out: return ret; } -static int -inotify_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) -{ - return get_sb_pseudo(fs_type, "inotify", NULL, - INOTIFYFS_SUPER_MAGIC, mnt); -} - -static struct file_system_type inotify_fs_type = { - .name = "inotifyfs", - .get_sb = inotify_get_sb, - .kill_sb = kill_anon_super, -}; - /* * inotify_user_setup - Our initialization function. Note that we cannnot return * error because we have compiled-in VFS hooks. So an (unlikely) failure here @@ -804,16 +767,6 @@ static struct file_system_type inotify_fs_type = { */ static int __init inotify_user_setup(void) { - int ret; - - ret = register_filesystem(&inotify_fs_type); - if (unlikely(ret)) - panic("inotify: register_filesystem returned %d!\n", ret); - - inotify_mnt = kern_mount(&inotify_fs_type); - if (IS_ERR(inotify_mnt)) - panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt)); - inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); diff --git a/include/linux/magic.h b/include/linux/magic.h index 76285e01b39e..eb9800f05782 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h @@ -52,7 +52,6 @@ #define CGROUP_SUPER_MAGIC 0x27e0eb #define FUTEXFS_SUPER_MAGIC 0xBAD1DEA -#define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA #define STACK_END_MAGIC 0x57AC6E9D -- cgit v1.2.3 From ac278a9c505092dd82077a2446af8f9fc0d9c095 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@ZenIV.linux.org.uk> Date: Tue, 16 Feb 2010 18:09:36 +0000 Subject: fix LOOKUP_FOLLOW on automount "symlinks" Make sure that automount "symlinks" are followed regardless of LOOKUP_FOLLOW; it should have no effect on them. Cc: stable@kernel.org Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index d62fdc875f22..a4855af776a8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -822,6 +822,17 @@ fail: return PTR_ERR(dentry); } +/* + * This is a temporary kludge to deal with "automount" symlinks; proper + * solution is to trigger them on follow_mount(), so that do_lookup() + * would DTRT. To be killed before 2.6.34-final. + */ +static inline int follow_on_final(struct inode *inode, unsigned lookup_flags) +{ + return inode && unlikely(inode->i_op->follow_link) && + ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode)); +} + /* * Name resolution. * This is the basic name resolution function, turning a pathname into @@ -942,8 +953,7 @@ last_component: if (err) break; inode = next.dentry->d_inode; - if ((lookup_flags & LOOKUP_FOLLOW) - && inode && inode->i_op->follow_link) { + if (follow_on_final(inode, lookup_flags)) { err = do_follow_link(&next, nd); if (err) goto return_err; -- cgit v1.2.3 From 7fee4868be91e71a3ee8e57289ebf5e10a12297e Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Thu, 14 Jan 2010 01:03:28 -0500 Subject: Switch proc/self to nd_set_link() Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/proc/base.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index e42bbd843ed1..58324c299165 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2369,16 +2369,30 @@ static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) { struct pid_namespace *ns = dentry->d_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); - char tmp[PROC_NUMBUF]; - if (!tgid) - return ERR_PTR(-ENOENT); - sprintf(tmp, "%d", task_tgid_nr_ns(current, ns)); - return ERR_PTR(vfs_follow_link(nd,tmp)); + char *name = ERR_PTR(-ENOENT); + if (tgid) { + name = __getname(); + if (!name) + name = ERR_PTR(-ENOMEM); + else + sprintf(name, "%d", tgid); + } + nd_set_link(nd, name); + return NULL; +} + +static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + __putname(s); } static const struct inode_operations proc_self_inode_operations = { .readlink = proc_self_readlink, .follow_link = proc_self_follow_link, + .put_link = proc_self_put_link, }; /* -- cgit v1.2.3 From aeaa5ccd6421fbf9e7ded0ac67b12ea2b9fcf51e Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <cebbert@redhat.com> Date: Mon, 15 Feb 2010 18:07:39 -0500 Subject: vfs: don't call ima_file_check() unconditionally in nfsd_open() commit 1e41568d7378d1ba8c64ba137b9ddd00b59f893a ("Take ima_path_check() in nfsd past dentry_open() in nfsd_open()") moved this code back to its original location but missed the "else". Signed-off-by: Chuck Ebbert <cebbert@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/nfsd/vfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 97d79eff6b7f..8715d194561a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -752,7 +752,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, flags, current_cred()); if (IS_ERR(*filp)) host_err = PTR_ERR(*filp); - host_err = ima_file_check(*filp, access); + else + host_err = ima_file_check(*filp, access); out_nfserr: err = nfserrno(host_err); out: -- cgit v1.2.3 From 0d561f12b490dd2b993d73112d3297007688e6df Mon Sep 17 00:00:00 2001 From: Jiro SEKIBA <jir@unicus.jp> Date: Sat, 20 Feb 2010 19:47:49 +0900 Subject: nilfs2: add reader's lock for cno in nilfs_ioctl_sync This adds reader's lock for the_nilfs->cno in nilfs_ioctl_sync, for the_nilfs->cno should be proctected by segctor_sem when reading. Signed-off-by: Jiro SEKIBA <jir@unicus.jp> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> --- fs/nilfs2/ioctl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 8e5cad020c30..313d0a21da48 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -601,13 +601,17 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, { __u64 cno; int ret; + struct the_nilfs *nilfs; ret = nilfs_construct_segment(inode->i_sb); if (ret < 0) return ret; if (argp != NULL) { - cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1; + nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + down_read(&nilfs->ns_segctor_sem); + cno = nilfs->ns_cno - 1; + up_read(&nilfs->ns_segctor_sem); if (copy_to_user(argp, &cno, sizeof(cno))) return -EFAULT; } -- cgit v1.2.3 From 8f9941aeccc318f243ab3fa55aaa17f4c1cb33f9 Mon Sep 17 00:00:00 2001 From: David Howells <dhowells@redhat.com> Date: Fri, 19 Feb 2010 18:14:21 +0000 Subject: CacheFiles: Fix a race in cachefiles_delete_object() vs rename cachefiles_delete_object() can race with rename. It gets the parent directory of the object it's asked to delete, then locks it - but rename may have changed the object's parent between the get and the completion of the lock. However, if such a circumstance is detected, we abandon our attempt to delete the object - since it's no longer in the index key path, it won't be seen again by lookups of that key. The assumption is that cachefilesd may have culled it by renaming it to the graveyard for later destruction. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/cachefiles/namei.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 14ac4806e291..eeb4986ea7db 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -348,7 +348,17 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, dir = dget_parent(object->dentry); mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); - ret = cachefiles_bury_object(cache, dir, object->dentry); + + /* we need to check that our parent is _still_ our parent - it may have + * been renamed */ + if (dir == object->dentry->d_parent) { + ret = cachefiles_bury_object(cache, dir, object->dentry); + } else { + /* it got moved, presumably by cachefilesd culling it, so it's + * no longer in the key path and we can ignore it */ + mutex_unlock(&dir->d_inode->i_mutex); + ret = 0; + } dput(dir); _leave(" = %d", ret); -- cgit v1.2.3 From f501912a35c02eadc55ca9396ece55fe36f785d0 Mon Sep 17 00:00:00 2001 From: Ben Myers <bpm@sgi.com> Date: Wed, 17 Feb 2010 14:05:11 -0600 Subject: commit_metadata export operation replacing nfsd_sync_dir - Add commit_metadata export_operation to allow the underlying filesystem to decide how to commit an inode most efficiently. - Usage of nfsd_sync_dir and write_inode_now has been replaced with the commit_metadata function that takes a svc_fh. - The commit_metadata function calls the commit_metadata export_op if it's there, or else falls back to sync_inode instead of fsync and write_inode_now because only metadata need be synced here. - nfsd4_sync_rec_dir now uses vfs_fsync so that commit_metadata can be static Signed-off-by: Ben Myers <bpm@sgi.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu> --- fs/nfsd/nfs4recover.c | 4 +- fs/nfsd/vfs.c | 106 +++++++++++++++++++++++------------------------ include/linux/exportfs.h | 5 +++ 3 files changed, 58 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 5a754f7b71ed..98fb98e330b4 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -119,9 +119,7 @@ out_no_tfm: static void nfsd4_sync_rec_dir(void) { - mutex_lock(&rec_dir.dentry->d_inode->i_mutex); - nfsd_sync_dir(rec_dir.dentry); - mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); + vfs_fsync(NULL, rec_dir.dentry, 0); } int diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ed024d329056..8afdba5082e8 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -27,6 +27,8 @@ #include <linux/jhash.h> #include <linux/ima.h> #include <asm/uaccess.h> +#include <linux/exportfs.h> +#include <linux/writeback.h> #ifdef CONFIG_NFSD_V3 #include "xdr3.h" @@ -271,6 +273,32 @@ out: return err; } +/* + * Commit metadata changes to stable storage. + */ +static int +commit_metadata(struct svc_fh *fhp) +{ + struct inode *inode = fhp->fh_dentry->d_inode; + const struct export_operations *export_ops = inode->i_sb->s_export_op; + int error = 0; + + if (!EX_ISSYNC(fhp->fh_export)) + return 0; + + if (export_ops->commit_metadata) { + error = export_ops->commit_metadata(inode); + } else { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, /* metadata only */ + }; + + error = sync_inode(inode, &wbc); + } + + return error; +} /* * Set various file attributes. @@ -768,28 +796,6 @@ nfsd_close(struct file *filp) fput(filp); } -/* - * Sync a directory to disk. - * - * We can't just call vfs_fsync because our requirements are slightly odd: - * - * a) we do not have a file struct available - * b) we expect to have i_mutex already held by the caller - */ -int -nfsd_sync_dir(struct dentry *dentry) -{ - struct inode *inode = dentry->d_inode; - int error; - - WARN_ON(!mutex_is_locked(&inode->i_mutex)); - - error = filemap_write_and_wait(inode->i_mapping); - if (!error && inode->i_fop->fsync) - error = inode->i_fop->fsync(NULL, dentry, 0); - return error; -} - /* * Obtain the readahead parameters for the file * specified by (dev, ino). @@ -1331,12 +1337,14 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out_nfserr; } - if (EX_ISSYNC(fhp->fh_export)) { - err = nfserrno(nfsd_sync_dir(dentry)); - write_inode_now(dchild->d_inode, 1); - } + err = nfsd_create_setattr(rqstp, resfhp, iap); - err2 = nfsd_create_setattr(rqstp, resfhp, iap); + /* + * nfsd_setattr already committed the child. Transactional filesystems + * had a chance to commit changes for both parent and child + * simultaneously making the following commit_metadata a noop. + */ + err2 = nfserrno(commit_metadata(fhp)); if (err2) err = err2; mnt_drop_write(fhp->fh_export->ex_path.mnt); @@ -1368,7 +1376,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, struct dentry *dentry, *dchild = NULL; struct inode *dirp; __be32 err; - __be32 err2; int host_err; __u32 v_mtime=0, v_atime=0; @@ -1463,11 +1470,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, if (created) *created = 1; - if (EX_ISSYNC(fhp->fh_export)) { - err = nfserrno(nfsd_sync_dir(dentry)); - /* setattr will sync the child (or not) */ - } - nfsd_check_ignore_resizing(iap); if (createmode == NFS3_CREATE_EXCLUSIVE) { @@ -1482,9 +1484,13 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, } set_attr: - err2 = nfsd_create_setattr(rqstp, resfhp, iap); - if (err2) - err = err2; + err = nfsd_create_setattr(rqstp, resfhp, iap); + + /* + * nfsd_setattr already committed the child (and possibly also the parent). + */ + if (!err) + err = nfserrno(commit_metadata(fhp)); mnt_drop_write(fhp->fh_export->ex_path.mnt); /* @@ -1599,12 +1605,9 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, } } else host_err = vfs_symlink(dentry->d_inode, dnew, path); - - if (!host_err) { - if (EX_ISSYNC(fhp->fh_export)) - host_err = nfsd_sync_dir(dentry); - } err = nfserrno(host_err); + if (!err) + err = nfserrno(commit_metadata(fhp)); fh_unlock(fhp); mnt_drop_write(fhp->fh_export->ex_path.mnt); @@ -1666,11 +1669,9 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, } host_err = vfs_link(dold, dirp, dnew); if (!host_err) { - if (EX_ISSYNC(ffhp->fh_export)) { - err = nfserrno(nfsd_sync_dir(ddir)); - write_inode_now(dest, 1); - } - err = 0; + err = nfserrno(commit_metadata(ffhp)); + if (!err) + err = nfserrno(commit_metadata(tfhp)); } else { if (host_err == -EXDEV && rqstp->rq_vers == 2) err = nfserr_acces; @@ -1766,10 +1767,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, goto out_dput_new; host_err = vfs_rename(fdir, odentry, tdir, ndentry); - if (!host_err && EX_ISSYNC(tfhp->fh_export)) { - host_err = nfsd_sync_dir(tdentry); + if (!host_err) { + host_err = commit_metadata(tfhp); if (!host_err) - host_err = nfsd_sync_dir(fdentry); + host_err = commit_metadata(ffhp); } mnt_drop_write(ffhp->fh_export->ex_path.mnt); @@ -1850,12 +1851,9 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, dput(rdentry); - if (host_err) - goto out_drop; - if (EX_ISSYNC(fhp->fh_export)) - host_err = nfsd_sync_dir(dentry); + if (!host_err) + host_err = commit_metadata(fhp); -out_drop: mnt_drop_write(fhp->fh_export->ex_path.mnt); out_nfserr: err = nfserrno(host_err); diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index dc12f416a49f..a9cd507f8cd2 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -96,6 +96,7 @@ struct fid { * @fh_to_parent: find the implied object's parent and get a dentry for it * @get_name: find the name for a given inode in a given directory * @get_parent: find the parent of a given directory + * @commit_metadata: commit metadata changes to stable storage * * See Documentation/filesystems/nfs/Exporting for details on how to use * this interface correctly. @@ -137,6 +138,9 @@ struct fid { * is also a directory. In the event that it cannot be found, or storage * space cannot be allocated, a %ERR_PTR should be returned. * + * commit_metadata: + * @commit_metadata should commit metadata changes to stable storage. + * * Locking rules: * get_parent is called with child->d_inode->i_mutex down * get_name is not (which is possibly inconsistent) @@ -152,6 +156,7 @@ struct export_operations { int (*get_name)(struct dentry *parent, char *name, struct dentry *child); struct dentry * (*get_parent)(struct dentry *child); + int (*commit_metadata)(struct inode *inode); }; extern int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, -- cgit v1.2.3 From 978ebd97d1426d5708d3f353179ab81f191a7eeb Mon Sep 17 00:00:00 2001 From: Ben Myers <bpm@sgi.com> Date: Wed, 17 Feb 2010 14:05:16 -0600 Subject: xfs_export_operations.commit_metadata This is the commit_metadata export operation for XFS. - Takes one inode to be committed. - Forces the log up to the lsn of the inode. - Doesn't force the log if the inode doesn't have a pincount. Signed-off-by: Ben Myers <bpm@sgi.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> [bfields@citi.umich.edu: trivial whitespace fix] Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu> --- fs/xfs/linux-2.6/xfs_export.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index 87b8cbd23d4b..8f4d70789e3f 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -29,6 +29,7 @@ #include "xfs_vnodeops.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" +#include "xfs_inode_item.h" /* * Note that we only accept fileids which are long enough rather than allow @@ -215,9 +216,28 @@ xfs_fs_get_parent( return d_obtain_alias(VFS_I(cip)); } +STATIC int +xfs_fs_nfs_commit_metadata( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + int error = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) { + error = _xfs_log_force(mp, ip->i_itemp->ili_last_lsn, + XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + return error; +} + const struct export_operations xfs_export_operations = { .encode_fh = xfs_fs_encode_fh, .fh_to_dentry = xfs_fs_fh_to_dentry, .fh_to_parent = xfs_fs_fh_to_parent, .get_parent = xfs_fs_get_parent, + .commit_metadata = xfs_fs_nfs_commit_metadata, }; -- cgit v1.2.3 From 1cc523271ef0b6305c565a143e3d48f6fff826dd Mon Sep 17 00:00:00 2001 From: stephen hemminger <shemminger@vyatta.com> Date: Mon, 22 Feb 2010 07:57:17 +0000 Subject: seq_file: add RCU versions of new hlist/list iterators (v3) Many usages of seq_file use RCU protected lists, so non RCU iterators will not work safely. Signed-off-by: Stephen Hemminger <shemminger@vyatta.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- fs/seq_file.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/rculist.h | 5 ++++ include/linux/seq_file.h | 15 +++++++--- 3 files changed, 87 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index f65b16f02da3..5afd554efad3 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -750,3 +750,74 @@ struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, return node->next; } EXPORT_SYMBOL(seq_hlist_next); + +/** + * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU + * @head: the head of the hlist + * @pos: the start position of the sequence + * + * Called at seq_file->op->start(). + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as hlist_add_head_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head, + loff_t pos) +{ + struct hlist_node *node; + + __hlist_for_each_rcu(node, head) + if (pos-- == 0) + return node; + return NULL; +} +EXPORT_SYMBOL(seq_hlist_start_rcu); + +/** + * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU + * @head: the head of the hlist + * @pos: the start position of the sequence + * + * Called at seq_file->op->start(). Call this function if you want to + * print a header at the top of the output. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as hlist_add_head_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head, + loff_t pos) +{ + if (!pos) + return SEQ_START_TOKEN; + + return seq_hlist_start_rcu(head, pos - 1); +} +EXPORT_SYMBOL(seq_hlist_start_head_rcu); + +/** + * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU + * @v: the current iterator + * @head: the head of the hlist + * @pos: the current posision + * + * Called at seq_file->op->next(). + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as hlist_add_head_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +struct hlist_node *seq_hlist_next_rcu(void *v, + struct hlist_head *head, + loff_t *ppos) +{ + struct hlist_node *node = v; + + ++*ppos; + if (v == SEQ_START_TOKEN) + return rcu_dereference(head->first); + else + return rcu_dereference(node->next); +} +EXPORT_SYMBOL(seq_hlist_next_rcu); diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 1bf0f708c4fc..701fe9cb552a 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -406,6 +406,11 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, n->next->pprev = &n->next; } +#define __hlist_for_each_rcu(pos, head) \ + for (pos = rcu_dereference((head)->first); \ + pos && ({ prefetch(pos->next); 1; }); \ + pos = rcu_dereference(pos->next)) + /** * hlist_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index c95bcdc18f4c..03c0232b4169 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -140,10 +140,17 @@ extern struct list_head *seq_list_next(void *v, struct list_head *head, */ extern struct hlist_node *seq_hlist_start(struct hlist_head *head, - loff_t pos); + loff_t pos); extern struct hlist_node *seq_hlist_start_head(struct hlist_head *head, - loff_t pos); + loff_t pos); extern struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, - loff_t *ppos); - + loff_t *ppos); + +extern struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head, + loff_t pos); +extern struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head, + loff_t pos); +extern struct hlist_node *seq_hlist_next_rcu(void *v, + struct hlist_head *head, + loff_t *ppos); #endif -- cgit v1.2.3 From a17e18790a8c47113a73139d54a375dc9ccd8f08 Mon Sep 17 00:00:00 2001 From: Michael Neuling <mikey@neuling.org> Date: Mon, 22 Feb 2010 12:44:24 -0800 Subject: fs/exec.c: fix initial stack reservation 803bf5ec259941936262d10ecc84511b76a20921 ("fs/exec.c: restrict initial stack space expansion to rlimit") attempts to limit the initial stack to 20*PAGE_SIZE. Unfortunately, in attempting ensure the stack is not reduced in size, we ended up not changing the stack at all. This size reduction check is not necessary as the expand_stack call does this already. This caused a regression in UML resulting in most guest processes being killed. Signed-off-by: Michael Neuling <mikey@neuling.org> Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Acked-by: WANG Cong <xiyou.wangcong@gmail.com> Cc: Anton Blanchard <anton@samba.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: James Morris <jmorris@namei.org> Cc: Serge Hallyn <serue@us.ibm.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Jouni Malinen <j@w1.fi> Cc: <stable@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/exec.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index e95c692ef0e4..cce6bbdbdbb1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -637,7 +637,6 @@ int setup_arg_pages(struct linux_binprm *bprm, * will align it up. */ rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK; - rlim_stack = min(rlim_stack, stack_size); #ifdef CONFIG_STACK_GROWSUP if (stack_size + stack_expand > rlim_stack) stack_base = vma->vm_start + rlim_stack; -- cgit v1.2.3 From 370b41911c0f1074e654a28466411b5c879e4dfd Mon Sep 17 00:00:00 2001 From: Jeff Layton <jlayton@redhat.com> Date: Wed, 10 Feb 2010 16:18:25 -0500 Subject: cifs: add parens around smb_var in BCC macros ...to remove ambiguity about how these values are interpreted when passing in more complex values as arguments. Signed-off-by: Jeff Layton <jlayton@redhat.com> Signed-off-by: Steve French <sfrench@us.ibm.com> --- fs/cifs/cifspdu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 3877737f96a6..14d036d8db11 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -415,10 +415,10 @@ struct smb_hdr { __u8 WordCount; } __attribute__((packed)); /* given a pointer to an smb_hdr retrieve the value of byte count */ -#define BCC(smb_var) (*(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) -#define BCC_LE(smb_var) (*(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) +#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount))) +#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount))) /* given a pointer to an smb_hdr retrieve the pointer to the byte area */ -#define pByteArea(smb_var) ((unsigned char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount) + 2) +#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2) /* * Computer Name Length (since Netbios name was length 16 with last byte 0x20) -- cgit v1.2.3 From f0d3868b78752024dd7550d0d18fb9e0601cfb69 Mon Sep 17 00:00:00 2001 From: Jeff Layton <jlayton@redhat.com> Date: Wed, 10 Feb 2010 16:18:26 -0500 Subject: cifs: clean up indentation in CIFSSMBQAllEAs Add a label that we can goto on error, and reduce some of the if/then/else indentation in this function. Signed-off-by: Jeff Layton <jlayton@redhat.com> Signed-off-by: Steve French <sfrench@us.ibm.com> --- fs/cifs/cifssmb.c | 151 +++++++++++++++++++++++++++--------------------------- 1 file changed, 75 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 941441d3e386..30709589e0c9 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -5282,9 +5282,10 @@ CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon, int rc = 0; int bytes_returned; int name_len; + struct fealist *ea_response_data; struct fea *temp_fea; char *temp_ptr; - __u16 params, byte_count; + __u16 params, byte_count, data_offset; cFYI(1, ("In Query All EAs path %s", searchName)); QAllEAsRetry: @@ -5334,85 +5335,83 @@ QAllEAsRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { cFYI(1, ("Send error in QueryAllEAs = %d", rc)); - } else { /* decode response */ - rc = validate_t2((struct smb_t2_rsp *)pSMBr); + goto QAllEAsOut; + } - /* BB also check enough total bytes returned */ - /* BB we need to improve the validity checking - of these trans2 responses */ - if (rc || (pSMBr->ByteCount < 4)) - rc = -EIO; /* bad smb */ - /* else if (pFindData){ - memcpy((char *) pFindData, - (char *) &pSMBr->hdr.Protocol + - data_offset, kl); - }*/ else { - /* check that length of list is not more than bcc */ - /* check that each entry does not go beyond length - of list */ - /* check that each element of each entry does not - go beyond end of list */ - __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); - struct fealist *ea_response_data; - rc = 0; - /* validate_trans2_offsets() */ - /* BB check if start of smb + data_offset > &bcc+ bcc */ - ea_response_data = (struct fealist *) - (((char *) &pSMBr->hdr.Protocol) + - data_offset); - name_len = le32_to_cpu(ea_response_data->list_len); - cFYI(1, ("ea length %d", name_len)); - if (name_len <= 8) { - /* returned EA size zeroed at top of function */ - cFYI(1, ("empty EA list returned from server")); - } else { - /* account for ea list len */ - name_len -= 4; - temp_fea = ea_response_data->list; - temp_ptr = (char *)temp_fea; - while (name_len > 0) { - __u16 value_len; - name_len -= 4; - temp_ptr += 4; - rc += temp_fea->name_len; - /* account for prefix user. and trailing null */ - rc = rc + 5 + 1; - if (rc < (int)buf_size) { - memcpy(EAData, "user.", 5); - EAData += 5; - memcpy(EAData, temp_ptr, - temp_fea->name_len); - EAData += temp_fea->name_len; - /* null terminate name */ - *EAData = 0; - EAData = EAData + 1; - } else if (buf_size == 0) { - /* skip copy - calc size only */ - } else { - /* stop before overrun buffer */ - rc = -ERANGE; - break; - } - name_len -= temp_fea->name_len; - temp_ptr += temp_fea->name_len; - /* account for trailing null */ - name_len--; - temp_ptr++; - value_len = - le16_to_cpu(temp_fea->value_len); - name_len -= value_len; - temp_ptr += value_len; - /* BB check that temp_ptr is still - within the SMB BB*/ - /* no trailing null to account for - in value len */ - /* go on to next EA */ - temp_fea = (struct fea *)temp_ptr; - } - } + /* BB also check enough total bytes returned */ + /* BB we need to improve the validity checking + of these trans2 responses */ + + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + if (rc || (pSMBr->ByteCount < 4)) { + rc = -EIO; /* bad smb */ + goto QAllEAsOut; + } + + /* check that length of list is not more than bcc */ + /* check that each entry does not go beyond length + of list */ + /* check that each element of each entry does not + go beyond end of list */ + /* validate_trans2_offsets() */ + /* BB check if start of smb + data_offset > &bcc+ bcc */ + + data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + ea_response_data = (struct fealist *) + (((char *) &pSMBr->hdr.Protocol) + data_offset); + + name_len = le32_to_cpu(ea_response_data->list_len); + cFYI(1, ("ea length %d", name_len)); + if (name_len <= 8) { + cFYI(1, ("empty EA list returned from server")); + goto QAllEAsOut; + } + + /* account for ea list len */ + name_len -= 4; + temp_fea = ea_response_data->list; + temp_ptr = (char *)temp_fea; + while (name_len > 0) { + __u16 value_len; + name_len -= 4; + temp_ptr += 4; + rc += temp_fea->name_len; + /* account for prefix user. and trailing null */ + rc = rc + 5 + 1; + if (rc < (int) buf_size) { + memcpy(EAData, "user.", 5); + EAData += 5; + memcpy(EAData, temp_ptr, temp_fea->name_len); + EAData += temp_fea->name_len; + /* null terminate name */ + *EAData = 0; + EAData = EAData + 1; + } else if (buf_size == 0) { + /* skip copy - calc size only */ + } else { + /* stop before overrun buffer */ + rc = -ERANGE; + break; } + name_len -= temp_fea->name_len; + temp_ptr += temp_fea->name_len; + /* account for trailing null */ + name_len--; + temp_ptr++; + value_len = le16_to_cpu(temp_fea->value_len); + name_len -= value_len; + temp_ptr += value_len; + /* BB check that temp_ptr is still + within the SMB BB*/ + + /* no trailing null to account for + in value len */ + /* go on to next EA */ + temp_fea = (struct fea *)temp_ptr; } + +QAllEAsOut: cifs_buf_release(pSMB); if (rc == -EAGAIN) goto QAllEAsRetry; -- cgit v1.2.3 From 6e462b9f2c37101312109aaa46e196bdca6c846d Mon Sep 17 00:00:00 2001 From: Jeff Layton <jlayton@redhat.com> Date: Wed, 10 Feb 2010 16:18:26 -0500 Subject: cifs: rename name_len to list_len in CIFSSMBQAllEAs ...for clarity and so we can reuse the name for the real name_len. Signed-off-by: Jeff Layton <jlayton@redhat.com> Signed-off-by: Steve French <sfrench@us.ibm.com> --- fs/cifs/cifssmb.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 30709589e0c9..f5e15279ea29 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -5281,7 +5281,7 @@ CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon, TRANSACTION2_QPI_RSP *pSMBr = NULL; int rc = 0; int bytes_returned; - int name_len; + int list_len; struct fealist *ea_response_data; struct fea *temp_fea; char *temp_ptr; @@ -5295,18 +5295,18 @@ QAllEAsRetry: return rc; if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = + list_len = cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, PATH_MAX, nls_codepage, remap); - name_len++; /* trailing null */ - name_len *= 2; + list_len++; /* trailing null */ + list_len *= 2; } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(searchName, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->FileName, searchName, name_len); + list_len = strnlen(searchName, PATH_MAX); + list_len++; /* trailing null */ + strncpy(pSMB->FileName, searchName, list_len); } - params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; + params = 2 /* level */ + 4 /* reserved */ + list_len /* includes NUL */; pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le16(2); /* BB find exact max SMB PDU from sess structure BB */ @@ -5361,23 +5361,23 @@ QAllEAsRetry: ea_response_data = (struct fealist *) (((char *) &pSMBr->hdr.Protocol) + data_offset); - name_len = le32_to_cpu(ea_response_data->list_len); - cFYI(1, ("ea length %d", name_len)); - if (name_len <= 8) { + list_len = le32_to_cpu(ea_response_data->list_len); + cFYI(1, ("ea length %d", list_len)); + if (list_len <= 8) { cFYI(1, ("empty EA list returned from server")); goto QAllEAsOut; } /* account for ea list len */ - name_len -= 4; + list_len -= 4; temp_fea = ea_response_data->list; temp_ptr = (char *)temp_fea; - while (name_len > 0) { + while (list_len > 0) { __u16 value_len; - name_len -= 4; + list_len -= 4; temp_ptr += 4; rc += temp_fea->name_len; - /* account for prefix user. and trailing null */ + /* account for prefix user. and trailing null */ rc = rc + 5 + 1; if (rc < (int) buf_size) { memcpy(EAData, "user.", 5); @@ -5386,7 +5386,7 @@ QAllEAsRetry: EAData += temp_fea->name_len; /* null terminate name */ *EAData = 0; - EAData = EAData + 1; + ++EAData; } else if (buf_size == 0) { /* skip copy - calc size only */ } else { @@ -5394,13 +5394,13 @@ QAllEAsRetry: rc = -ERANGE; break; } - name_len -= temp_fea->name_len; + list_len -= temp_fea->name_len; temp_ptr += temp_fea->name_len; /* account for trailing null */ - name_len--; + list_len--; temp_ptr++; value_len = le16_to_cpu(temp_fea->value_len); - name_len -= value_len; + list_len -= value_len; temp_ptr += value_len; /* BB check that temp_ptr is still within the SMB BB*/ -- cgit v1.2.3 From e529614ad0c2bf1f3a6fcf1402aa77430ecfaa2c Mon Sep 17 00:00:00 2001 From: Jeff Layton <jlayton@redhat.com> Date: Wed, 10 Feb 2010 16:18:26 -0500 Subject: cifs: increase maximum buffer size in CIFSSMBQAllEAs It's 4000 now, but there's no reason to limit it to that. We should be able to handle a response up to CIFSMaxBufSize. Signed-off-by: Jeff Layton <jlayton@redhat.com> Signed-off-by: Steve French <sfrench@us.ibm.com> --- fs/cifs/cifssmb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f5e15279ea29..4f24f83ce623 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -5310,7 +5310,7 @@ QAllEAsRetry: pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le16(2); /* BB find exact max SMB PDU from sess structure BB */ - pSMB->MaxDataCount = cpu_to_le16(4000); + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; -- cgit v1.2.3 From 0cd126b504cede8a74acf7583a44eba32f9a1da1 Mon Sep 17 00:00:00 2001 From: Jeff Layton <jlayton@redhat.com> Date: Wed, 10 Feb 2010 16:18:26 -0500 Subject: cifs: verify lengths of QueryAllEAs reply Make sure the lengths in a QUERY_ALL_EAS reply don't make the parser walk off the end of the SMB. Signed-off-by: Jeff Layton <jlayton@redhat.com> Signed-off-by: Steve French <sfrench@us.ibm.com> --- fs/cifs/cifssmb.c | 49 +++++++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4f24f83ce623..e197e1647d5d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -5285,6 +5285,7 @@ CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon, struct fealist *ea_response_data; struct fea *temp_fea; char *temp_ptr; + char *end_of_smb; __u16 params, byte_count, data_offset; cFYI(1, ("In Query All EAs path %s", searchName)); @@ -5368,22 +5369,47 @@ QAllEAsRetry: goto QAllEAsOut; } + /* make sure list_len doesn't go past end of SMB */ + end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr); + if ((char *)ea_response_data + list_len > end_of_smb) { + cFYI(1, ("EA list appears to go beyond SMB")); + rc = -EIO; + goto QAllEAsOut; + } + /* account for ea list len */ list_len -= 4; temp_fea = ea_response_data->list; temp_ptr = (char *)temp_fea; while (list_len > 0) { + __u8 name_len; __u16 value_len; + list_len -= 4; temp_ptr += 4; - rc += temp_fea->name_len; + /* make sure we can read name_len and value_len */ + if (list_len < 0) { + cFYI(1, ("EA entry goes beyond length of list")); + rc = -EIO; + goto QAllEAsOut; + } + + name_len = temp_fea->name_len; + value_len = le16_to_cpu(temp_fea->value_len); + list_len -= name_len + 1 + value_len; + if (list_len < 0) { + cFYI(1, ("EA entry goes beyond length of list")); + rc = -EIO; + goto QAllEAsOut; + } + /* account for prefix user. and trailing null */ - rc = rc + 5 + 1; + rc += (5 + 1 + name_len); if (rc < (int) buf_size) { memcpy(EAData, "user.", 5); EAData += 5; - memcpy(EAData, temp_ptr, temp_fea->name_len); - EAData += temp_fea->name_len; + memcpy(EAData, temp_ptr, name_len); + EAData += name_len; /* null terminate name */ *EAData = 0; ++EAData; @@ -5394,20 +5420,7 @@ QAllEAsRetry: rc = -ERANGE; break; } - list_len -= temp_fea->name_len; - temp_ptr += temp_fea->name_len; - /* account for trailing null */ - list_len--; - temp_ptr++; - value_len = le16_to_cpu(temp_fea->value_len); - list_len -= value_len; - temp_ptr += value_len; - /* BB check that temp_ptr is still - within the SMB BB*/ - - /* no trailing null to account for - in value len */ - /* go on to next EA */ + temp_ptr += name_len + 1 + value_len; temp_fea = (struct fea *)temp_ptr; } -- cgit v1.2.3 From 31c0519f7af99ae60fd39f7f1c1f7ae1efb56760 Mon Sep 17 00:00:00 2001 From: Jeff Layton <jlayton@redhat.com> Date: Wed, 10 Feb 2010 16:18:26 -0500 Subject: cifs: merge CIFSSMBQueryEA with CIFSSMBQAllEAs Add an "ea_name" parameter to CIFSSMBQAllEAs. When it's set make it behave like CIFSSMBQueryEA does now. The current callers of CIFSSMBQueryEA are converted to use CIFSSMBQAllEAs, and the old CIFSSMBQueryEA function is removed. Signed-off-by: Jeff Layton <jlayton@redhat.com> Signed-off-by: Steve French <sfrench@us.ibm.com> --- fs/cifs/cifsproto.h | 7 +- fs/cifs/cifssmb.c | 214 ++++++++++++---------------------------------------- fs/cifs/inode.c | 2 +- fs/cifs/xattr.c | 8 +- 4 files changed, 54 insertions(+), 177 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 5646727e33f5..88e2bc44ac58 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -363,13 +363,10 @@ extern int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon, __u32 filter, struct file *file, int multishot, const struct nls_table *nls_codepage); extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon, - const unsigned char *searchName, char *EAData, + const unsigned char *searchName, + const unsigned char *ea_name, char *EAData, size_t bufsize, const struct nls_table *nls_codepage, int remap_special_chars); -extern ssize_t CIFSSMBQueryEA(const int xid, struct cifsTconInfo *tcon, - const unsigned char *searchName, const unsigned char *ea_name, - unsigned char *ea_value, size_t buf_size, - const struct nls_table *nls_codepage, int remap_special_chars); extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, const char *fileName, const char *ea_name, const void *ea_value, const __u16 ea_value_len, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index e197e1647d5d..d6d40b883abc 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -5269,12 +5269,22 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon, cifs_buf_release(pSMB); return rc; } + #ifdef CONFIG_CIFS_XATTR +/* + * Do a path-based QUERY_ALL_EAS call and parse the result. This is a common + * function used by listxattr and getxattr type calls. When ea_name is set, + * it looks for that attribute name and stuffs that value into the EAData + * buffer. When ea_name is NULL, it stuffs a list of attribute names into the + * buffer. In both cases, the return value is either the length of the + * resulting data or a negative error code. If EAData is a NULL pointer then + * the data isn't copied to it, but the length is returned. + */ ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon, - const unsigned char *searchName, - char *EAData, size_t buf_size, - const struct nls_table *nls_codepage, int remap) + const unsigned char *searchName, const unsigned char *ea_name, + char *EAData, size_t buf_size, + const struct nls_table *nls_codepage, int remap) { /* BB assumes one setup word */ TRANSACTION2_QPI_REQ *pSMB = NULL; @@ -5403,27 +5413,46 @@ QAllEAsRetry: goto QAllEAsOut; } - /* account for prefix user. and trailing null */ - rc += (5 + 1 + name_len); - if (rc < (int) buf_size) { - memcpy(EAData, "user.", 5); - EAData += 5; - memcpy(EAData, temp_ptr, name_len); - EAData += name_len; - /* null terminate name */ - *EAData = 0; - ++EAData; - } else if (buf_size == 0) { - /* skip copy - calc size only */ + if (ea_name) { + if (strncmp(ea_name, temp_ptr, name_len) == 0) { + temp_ptr += name_len + 1; + rc = value_len; + if (buf_size == 0) + goto QAllEAsOut; + if ((size_t)value_len > buf_size) { + rc = -ERANGE; + goto QAllEAsOut; + } + memcpy(EAData, temp_ptr, value_len); + goto QAllEAsOut; + } } else { - /* stop before overrun buffer */ - rc = -ERANGE; - break; + /* account for prefix user. and trailing null */ + rc += (5 + 1 + name_len); + if (rc < (int) buf_size) { + memcpy(EAData, "user.", 5); + EAData += 5; + memcpy(EAData, temp_ptr, name_len); + EAData += name_len; + /* null terminate name */ + *EAData = 0; + ++EAData; + } else if (buf_size == 0) { + /* skip copy - calc size only */ + } else { + /* stop before overrun buffer */ + rc = -ERANGE; + break; + } } temp_ptr += name_len + 1 + value_len; temp_fea = (struct fea *)temp_ptr; } + /* didn't find the named attribute */ + if (ea_name) + rc = -ENODATA; + QAllEAsOut: cifs_buf_release(pSMB); if (rc == -EAGAIN) @@ -5432,155 +5461,6 @@ QAllEAsOut: return (ssize_t)rc; } -ssize_t CIFSSMBQueryEA(const int xid, struct cifsTconInfo *tcon, - const unsigned char *searchName, const unsigned char *ea_name, - unsigned char *ea_value, size_t buf_size, - const struct nls_table *nls_codepage, int remap) -{ - TRANSACTION2_QPI_REQ *pSMB = NULL; - TRANSACTION2_QPI_RSP *pSMBr = NULL; - int rc = 0; - int bytes_returned; - int name_len; - struct fea *temp_fea; - char *temp_ptr; - __u16 params, byte_count; - - cFYI(1, ("In Query EA path %s", searchName)); -QEARetry: - rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, - (void **) &pSMBr); - if (rc) - return rc; - - if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = - cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, - PATH_MAX, nls_codepage, remap); - name_len++; /* trailing null */ - name_len *= 2; - } else { /* BB improve the check for buffer overruns BB */ - name_len = strnlen(searchName, PATH_MAX); - name_len++; /* trailing null */ - strncpy(pSMB->FileName, searchName, name_len); - } - - params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; - pSMB->TotalDataCount = 0; - pSMB->MaxParameterCount = cpu_to_le16(2); - /* BB find exact max SMB PDU from sess structure BB */ - pSMB->MaxDataCount = cpu_to_le16(4000); - pSMB->MaxSetupCount = 0; - pSMB->Reserved = 0; - pSMB->Flags = 0; - pSMB->Timeout = 0; - pSMB->Reserved2 = 0; - pSMB->ParameterOffset = cpu_to_le16(offsetof( - struct smb_com_transaction2_qpi_req, InformationLevel) - 4); - pSMB->DataCount = 0; - pSMB->DataOffset = 0; - pSMB->SetupCount = 1; - pSMB->Reserved3 = 0; - pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); - byte_count = params + 1 /* pad */ ; - pSMB->TotalParameterCount = cpu_to_le16(params); - pSMB->ParameterCount = pSMB->TotalParameterCount; - pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS); - pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; - pSMB->ByteCount = cpu_to_le16(byte_count); - - rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *) pSMBr, &bytes_returned, 0); - if (rc) { - cFYI(1, ("Send error in Query EA = %d", rc)); - } else { /* decode response */ - rc = validate_t2((struct smb_t2_rsp *)pSMBr); - - /* BB also check enough total bytes returned */ - /* BB we need to improve the validity checking - of these trans2 responses */ - if (rc || (pSMBr->ByteCount < 4)) - rc = -EIO; /* bad smb */ - /* else if (pFindData){ - memcpy((char *) pFindData, - (char *) &pSMBr->hdr.Protocol + - data_offset, kl); - }*/ else { - /* check that length of list is not more than bcc */ - /* check that each entry does not go beyond length - of list */ - /* check that each element of each entry does not - go beyond end of list */ - __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); - struct fealist *ea_response_data; - rc = -ENODATA; - /* validate_trans2_offsets() */ - /* BB check if start of smb + data_offset > &bcc+ bcc*/ - ea_response_data = (struct fealist *) - (((char *) &pSMBr->hdr.Protocol) + - data_offset); - name_len = le32_to_cpu(ea_response_data->list_len); - cFYI(1, ("ea length %d", name_len)); - if (name_len <= 8) { - /* returned EA size zeroed at top of function */ - cFYI(1, ("empty EA list returned from server")); - } else { - /* account for ea list len */ - name_len -= 4; - temp_fea = ea_response_data->list; - temp_ptr = (char *)temp_fea; - /* loop through checking if we have a matching - name and then return the associated value */ - while (name_len > 0) { - __u16 value_len; - name_len -= 4; - temp_ptr += 4; - value_len = - le16_to_cpu(temp_fea->value_len); - /* BB validate that value_len falls within SMB, - even though maximum for name_len is 255 */ - if (memcmp(temp_fea->name, ea_name, - temp_fea->name_len) == 0) { - /* found a match */ - rc = value_len; - /* account for prefix user. and trailing null */ - if (rc <= (int)buf_size) { - memcpy(ea_value, - temp_fea->name+temp_fea->name_len+1, - rc); - /* ea values, unlike ea - names, are not null - terminated */ - } else if (buf_size == 0) { - /* skip copy - calc size only */ - } else { - /* stop before overrun buffer */ - rc = -ERANGE; - } - break; - } - name_len -= temp_fea->name_len; - temp_ptr += temp_fea->name_len; - /* account for trailing null */ - name_len--; - temp_ptr++; - name_len -= value_len; - temp_ptr += value_len; - /* No trailing null to account for in - value_len. Go on to next EA */ - temp_fea = (struct fea *)temp_ptr; - } - } - } - } - cifs_buf_release(pSMB); - if (rc == -EAGAIN) - goto QEARetry; - - return (ssize_t)rc; -} - int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, const char *fileName, const char *ea_name, const void *ea_value, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index e3fda978f481..5e9cc9c2fc7d 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -366,7 +366,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, char ea_value[4]; __u32 mode; - rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS", + rc = CIFSSMBQAllEAs(xid, cifs_sb->tcon, path, "SETFILEBITS", ea_value, 4 /* size of buf */, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index a75afa3dd9e1..3e2ef0de1209 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -244,7 +244,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, /* revalidate/getattr then populate from inode */ } /* BB add else when above is implemented */ ea_name += 5; /* skip past user. prefix */ - rc = CIFSSMBQueryEA(xid, pTcon, full_path, ea_name, ea_value, + rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, buf_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { @@ -252,7 +252,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, goto get_ea_exit; ea_name += 4; /* skip past os2. prefix */ - rc = CIFSSMBQueryEA(xid, pTcon, full_path, ea_name, ea_value, + rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, buf_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, @@ -364,8 +364,8 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) /* if proc/fs/cifs/streamstoxattr is set then search server for EAs or streams to returns as xattrs */ - rc = CIFSSMBQAllEAs(xid, pTcon, full_path, data, buf_size, - cifs_sb->local_nls, + rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data, + buf_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); -- cgit v1.2.3 From 96c03bccc7defb1f0d486165af5c0b8590f56c11 Mon Sep 17 00:00:00 2001 From: Steve French <sfrench@us.ibm.com> Date: Tue, 23 Feb 2010 20:51:43 +0000 Subject: [CIFS] Minor cleanup to EA patch CC: Jeff Layton <jlayton@redhat.com> Signed-off-by: Steve French <sfrench@us.ibm.com> --- fs/cifs/CHANGES | 3 ++- fs/cifs/cifssmb.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 49503d2edc7e..bc0025cdd1c9 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,6 +1,7 @@ Version 1.62 ------------ -Add sockopt=TCP_NODELAY mount option. +Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened +to more strictly handle corrupt frames. Version 1.61 ------------ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index d6d40b883abc..99ae57d01d4f 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -5392,7 +5392,7 @@ QAllEAsRetry: temp_fea = ea_response_data->list; temp_ptr = (char *)temp_fea; while (list_len > 0) { - __u8 name_len; + int name_len; __u16 value_len; list_len -= 4; -- cgit v1.2.3 From 835a36ca4a4cd9da557318c0e213346644a4b2c8 Mon Sep 17 00:00:00 2001 From: Jeff Layton <jlayton@redhat.com> Date: Wed, 10 Feb 2010 16:21:33 -0500 Subject: cifs: set server_eof in cifs_fattr_to_inode Signed-off-by: Jeff Layton <jlayton@redhat.com> Signed-off-by: Steve French <sfrench@us.ibm.com> --- fs/cifs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 5e9cc9c2fc7d..8bdbc818164c 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -111,6 +111,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING; + cifs_i->server_eof = fattr->cf_eof; /* * Can't safely change the file size here if the client is writing to * it due to potential races. -- cgit v1.2.3 From c8d46e41bc744c8fa0092112af3942fcd46c8b18 Mon Sep 17 00:00:00 2001 From: Jiaying Zhang <jiayingz@google.com> Date: Wed, 24 Feb 2010 09:52:53 -0500 Subject: ext4: Add flag to files with blocks intentionally past EOF fallocate() may potentially instantiate blocks past EOF, depending on the flags used when it is called. e2fsck currently has a test for blocks past i_size, and it sometimes trips up - noticeably on xfstests 013 which runs fsstress. This patch from Jiayang does fix it up - it (along with e2fsprogs updates and other patches recently from Aneesh) has survived many fsstress runs in a row. Signed-off-by: Eric Sandeen <sandeen@redhat.com> Signed-off-by: Jiaying Zhang <jiayingz@google.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/ext4.h | 5 +++-- fs/ext4/extents.c | 22 +++++++++++++++++++++- fs/ext4/inode.c | 9 ++++++++- fs/ext4/ioctl.c | 9 +++++++++ 4 files changed, 41 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 509437ffb71b..74664ca19e22 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -285,10 +285,11 @@ struct flex_groups { #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ +#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */ /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7d54850f7136..a2c21aa09e2b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3186,7 +3186,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, { struct ext4_ext_path *path = NULL; struct ext4_extent_header *eh; - struct ext4_extent newex, *ex; + struct ext4_extent newex, *ex, *last_ex; ext4_fsblk_t newblock; int err = 0, depth, ret, cache_type; unsigned int allocated = 0; @@ -3367,6 +3367,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, EXT4_STATE_DIO_UNWRITTEN); } } + + if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { + if (eh->eh_entries) { + last_ex = EXT_LAST_EXTENT(eh); + if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) + + ext4_ext_get_actual_len(last_ex)) + EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; + } else { + WARN_ON(eh->eh_entries == 0); + ext4_error(inode->i_sb, __func__, + "inode#%lu, eh->eh_entries = 0!", inode->i_ino); + } + } err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { /* free data blocks we just allocated */ @@ -3500,6 +3513,13 @@ static void ext4_falloc_update_inode(struct inode *inode, i_size_write(inode, new_size); if (new_size > EXT4_I(inode)->i_disksize) ext4_update_i_disksize(inode, new_size); + } else { + /* + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if (new_size > i_size_read(inode)) + EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL; } } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ecac8c5a6f5c..edb7edc99f71 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4456,6 +4456,8 @@ void ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return; + EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); @@ -5305,7 +5307,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { + attr->ia_valid & ATTR_SIZE && + (attr->ia_size < inode->i_size || + (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) { handle_t *handle; handle = ext4_journal_start(inode, 3); @@ -5336,6 +5340,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) goto err_out; } } + /* ext4_truncate will clear the flag */ + if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) + ext4_truncate(inode); } rc = inode_setattr(inode, attr); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index b63d193126db..2220feb2dcc1 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -92,6 +92,15 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags &= ~EXT4_EXTENTS_FL; } + if (flags & EXT4_EOFBLOCKS_FL) { + /* we don't support adding EOFBLOCKS flag */ + if (!(oldflags & EXT4_EOFBLOCKS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (oldflags & EXT4_EOFBLOCKS_FL) + ext4_truncate(inode); + handle = ext4_journal_start(inode, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); -- cgit v1.2.3 From 482a74258fd08d30bf2ab0f5549afab5a5c9daba Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov <dmonakhov@openvz.org> Date: Wed, 24 Feb 2010 11:35:32 -0500 Subject: ext4: mount flags manipulation cleanup Replace intermediate EXT4_MOUNT_XXX flags manipulation to corresponding macro. Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Acked-by: Jan Kara <jack@suse.cz> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/super.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1c85bb67e6eb..7e8b1b4236d3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -796,10 +796,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq, if (sbi->s_qf_names[GRPQUOTA]) seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) + if (test_opt(sb, USRQUOTA)) seq_puts(seq, ",usrquota"); - if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) + if (test_opt(sb, GRPQUOTA)) seq_puts(seq, ",grpquota"); #endif } @@ -1383,14 +1383,13 @@ static int parse_options(char *options, struct super_block *sb, data_opt = EXT4_MOUNT_WRITEBACK_DATA; datacheck: if (is_remount) { - if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS) - != data_opt) { + if (test_opt(sb, DATA_FLAGS) != data_opt) { ext4_msg(sb, KERN_ERR, "Cannot change data mode on remount"); return 0; } } else { - sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS; + clear_opt(sbi->s_mount_opt, DATA_FLAGS); sbi->s_mount_opt |= data_opt; } break; @@ -1625,18 +1624,14 @@ set_qf_format: } #ifdef CONFIG_QUOTA if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { - if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) && - sbi->s_qf_names[USRQUOTA]) + if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sbi->s_mount_opt, USRQUOTA); - if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) && - sbi->s_qf_names[GRPQUOTA]) + if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) clear_opt(sbi->s_mount_opt, GRPQUOTA); - if ((sbi->s_qf_names[USRQUOTA] && - (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) || - (sbi->s_qf_names[GRPQUOTA] && - (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) { + if ((sbi->s_qf_names[USRQUOTA] && test_opt(sb, GRPQUOTA)) || + (sbi->s_qf_names[GRPQUOTA] && test_opt(sb, USRQUOTA))) { ext4_msg(sb, KERN_ERR, "old and new quota " "format mixing"); return 0; @@ -2452,11 +2447,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, POSIX_ACL); #endif if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) - sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; + set_opt(sbi->s_mount_opt, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) - sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA; + set_opt(sbi->s_mount_opt, ORDERED_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) - sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA; + set_opt(sbi->s_mount_opt, WRITEBACK_DATA); if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) set_opt(sbi->s_mount_opt, ERRORS_PANIC); @@ -2484,7 +2479,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || @@ -3520,7 +3515,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_abort(sb, __func__, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); es = sbi->s_es; -- cgit v1.2.3 From 56c50f11f4d11cb14d78fe52330efb69d219c62f Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov <dmonakhov@openvz.org> Date: Mon, 1 Mar 2010 23:28:41 -0500 Subject: ext4: trivial quota cleanup The patch is aimed to reorganize and simplify quota code a bit. Quota code is itself complex enough, but we can make it more readable in some places: - Move quota option parsing to separate functions. - Simplify old-quota and journaled-quota mix check. Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Acked-by: Jan Kara <jack@suse.cz> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/super.c | 123 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 69 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7e8b1b4236d3..79937e921988 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1206,6 +1206,64 @@ static ext4_fsblk_t get_sb_block(void **data) #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) +#ifdef CONFIG_QUOTA +static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + char *qname; + + if (sb_any_quota_loaded(sb) && + !sbi->s_qf_names[qtype]) { + ext4_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return 0; + } + qname = match_strdup(args); + if (!qname) { + ext4_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return 0; + } + if (sbi->s_qf_names[qtype] && + strcmp(sbi->s_qf_names[qtype], qname)) { + ext4_msg(sb, KERN_ERR, + "%s quota file already specified", QTYPE2NAME(qtype)); + kfree(qname); + return 0; + } + sbi->s_qf_names[qtype] = qname; + if (strchr(sbi->s_qf_names[qtype], '/')) { + ext4_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; + } + set_opt(sbi->s_mount_opt, QUOTA); + return 1; +} + +static int clear_qf_name(struct super_block *sb, int qtype) +{ + + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sb_any_quota_loaded(sb) && + sbi->s_qf_names[qtype]) { + ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return 0; + } + /* + * The space will be released later when all options are confirmed + * to be correct + */ + sbi->s_qf_names[qtype] = NULL; + return 1; +} +#endif + static int parse_options(char *options, struct super_block *sb, unsigned long *journal_devnum, unsigned int *journal_ioprio, @@ -1217,8 +1275,7 @@ static int parse_options(char *options, struct super_block *sb, int data_opt = 0; int option; #ifdef CONFIG_QUOTA - int qtype, qfmt; - char *qname; + int qfmt; #endif if (!options) @@ -1401,63 +1458,22 @@ static int parse_options(char *options, struct super_block *sb, break; #ifdef CONFIG_QUOTA case Opt_usrjquota: - qtype = USRQUOTA; - goto set_qf_name; - case Opt_grpjquota: - qtype = GRPQUOTA; -set_qf_name: - if (sb_any_quota_loaded(sb) && - !sbi->s_qf_names[qtype]) { - ext4_msg(sb, KERN_ERR, - "Cannot change journaled " - "quota options when quota turned on"); + if (!set_qf_name(sb, USRQUOTA, &args[0])) return 0; - } - qname = match_strdup(&args[0]); - if (!qname) { - ext4_msg(sb, KERN_ERR, - "Not enough memory for " - "storing quotafile name"); - return 0; - } - if (sbi->s_qf_names[qtype] && - strcmp(sbi->s_qf_names[qtype], qname)) { - ext4_msg(sb, KERN_ERR, - "%s quota file already " - "specified", QTYPE2NAME(qtype)); - kfree(qname); - return 0; - } - sbi->s_qf_names[qtype] = qname; - if (strchr(sbi->s_qf_names[qtype], '/')) { - ext4_msg(sb, KERN_ERR, - "quotafile must be on " - "filesystem root"); - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + break; + case Opt_grpjquota: + if (!set_qf_name(sb, GRPQUOTA, &args[0])) return 0; - } - set_opt(sbi->s_mount_opt, QUOTA); break; case Opt_offusrjquota: - qtype = USRQUOTA; - goto clear_qf_name; + if (!clear_qf_name(sb, USRQUOTA)) + return 0; + break; case Opt_offgrpjquota: - qtype = GRPQUOTA; -clear_qf_name: - if (sb_any_quota_loaded(sb) && - sbi->s_qf_names[qtype]) { - ext4_msg(sb, KERN_ERR, "Cannot change " - "journaled quota options when " - "quota turned on"); + if (!clear_qf_name(sb, GRPQUOTA)) return 0; - } - /* - * The space will be released later when all options - * are confirmed to be correct - */ - sbi->s_qf_names[qtype] = NULL; break; + case Opt_jqfmt_vfsold: qfmt = QFMT_VFS_OLD; goto set_qf_format; @@ -1630,8 +1646,7 @@ set_qf_format: if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) clear_opt(sbi->s_mount_opt, GRPQUOTA); - if ((sbi->s_qf_names[USRQUOTA] && test_opt(sb, GRPQUOTA)) || - (sbi->s_qf_names[GRPQUOTA] && test_opt(sb, USRQUOTA))) { + if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { ext4_msg(sb, KERN_ERR, "old and new quota " "format mixing"); return 0; -- cgit v1.2.3 From 23e2af3518facab6838d7aac1f46fbd7a5a290ce Mon Sep 17 00:00:00 2001 From: dingdinghua <dingdinghua@nrchpc.ac.cn> Date: Wed, 24 Feb 2010 12:11:20 -0500 Subject: jbd2: clean up an assertion in jbd2_journal_commit_transaction() commit_transaction has the same value as journal->j_running_transaction, so we can simplify the assert statement. Signed-off-by: dingdinghua <dingdinghua@nrchpc.ac.cn> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/jbd2/commit.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 3ee211ed58f1..671da7fb7ffd 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -883,8 +883,7 @@ restart_loop: spin_unlock(&journal->j_list_lock); bh = jh2bh(jh); jbd_lock_bh_state(bh); - J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || - jh->b_transaction == journal->j_running_transaction); + J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); /* * If there is undo-protected committed data against -- cgit v1.2.3 From 7fe2b3190b8b299409f13cf3a6f85c2bd371f8bb Mon Sep 17 00:00:00 2001 From: David Teigland <teigland@redhat.com> Date: Wed, 24 Feb 2010 11:08:18 -0600 Subject: dlm: fix ordering of bast and cast When both blocking and completion callbacks are queued for lock, the dlm would always deliver the completion callback (cast) first. In some cases the blocking callback (bast) is queued before the cast, though, and should be delivered first. This patch keeps track of the order in which they were queued and delivers them in that order. This patch also keeps track of the granted mode in the last cast and eliminates the following bast if the bast mode is compatible with the preceding cast mode. This happens when a remotely mastered lock is demoted, e.g. EX->NL, in which case the local node queues a cast immediately after sending the demote message. In this way a cast can be queued for a mode, e.g. NL, that makes an in-transit bast extraneous. Signed-off-by: David Teigland <teigland@redhat.com> --- fs/dlm/ast.c | 74 ++++++++++++++++++++++++++++++++++++++++----------- fs/dlm/ast.h | 4 +-- fs/dlm/dlm_internal.h | 10 +++++-- fs/dlm/lock.c | 4 +-- fs/dlm/user.c | 10 ++++--- fs/dlm/user.h | 4 +-- 6 files changed, 78 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index dc2ad6008b2d..4314f0d48d85 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb) spin_unlock(&ast_queue_lock); } -void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode) +void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode) { if (lkb->lkb_flags & DLM_IFL_USER) { - dlm_user_add_ast(lkb, type, bastmode); + dlm_user_add_ast(lkb, type, mode); return; } @@ -44,10 +44,21 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode) if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) { kref_get(&lkb->lkb_ref); list_add_tail(&lkb->lkb_astqueue, &ast_queue); + lkb->lkb_ast_first = type; } + + /* sanity check, this should not happen */ + + if ((type == AST_COMP) && (lkb->lkb_ast_type & AST_COMP)) + log_print("repeat cast %d castmode %d lock %x %s", + mode, lkb->lkb_castmode, + lkb->lkb_id, lkb->lkb_resource->res_name); + lkb->lkb_ast_type |= type; - if (bastmode) - lkb->lkb_bastmode = bastmode; + if (type == AST_BAST) + lkb->lkb_bastmode = mode; + else + lkb->lkb_castmode = mode; spin_unlock(&ast_queue_lock); set_bit(WAKE_ASTS, &astd_wakeflags); @@ -59,9 +70,9 @@ static void process_asts(void) struct dlm_ls *ls = NULL; struct dlm_rsb *r = NULL; struct dlm_lkb *lkb; - void (*cast) (void *astparam); - void (*bast) (void *astparam, int mode); - int type = 0, bastmode; + void (*castfn) (void *astparam); + void (*bastfn) (void *astparam, int mode); + int type, first, bastmode, castmode, do_bast, do_cast, last_castmode; repeat: spin_lock(&ast_queue_lock); @@ -75,17 +86,48 @@ repeat: list_del(&lkb->lkb_astqueue); type = lkb->lkb_ast_type; lkb->lkb_ast_type = 0; + first = lkb->lkb_ast_first; + lkb->lkb_ast_first = 0; bastmode = lkb->lkb_bastmode; - + castmode = lkb->lkb_castmode; + castfn = lkb->lkb_astfn; + bastfn = lkb->lkb_bastfn; spin_unlock(&ast_queue_lock); - cast = lkb->lkb_astfn; - bast = lkb->lkb_bastfn; - - if ((type & AST_COMP) && cast) - cast(lkb->lkb_astparam); - if ((type & AST_BAST) && bast) - bast(lkb->lkb_astparam, bastmode); + do_cast = (type & AST_COMP) && castfn; + do_bast = (type & AST_BAST) && bastfn; + + /* Skip a bast if its blocking mode is compatible with the + granted mode of the preceding cast. */ + + if (do_bast) { + if (first == AST_COMP) + last_castmode = castmode; + else + last_castmode = lkb->lkb_castmode_done; + if (dlm_modes_compat(bastmode, last_castmode)) + do_bast = 0; + } + + if (first == AST_COMP) { + if (do_cast) + castfn(lkb->lkb_astparam); + if (do_bast) + bastfn(lkb->lkb_astparam, bastmode); + } else if (first == AST_BAST) { + if (do_bast) + bastfn(lkb->lkb_astparam, bastmode); + if (do_cast) + castfn(lkb->lkb_astparam); + } else { + log_error(ls, "bad ast_first %d ast_type %d", + first, type); + } + + if (do_cast) + lkb->lkb_castmode_done = castmode; + if (do_bast) + lkb->lkb_bastmode_done = bastmode; /* this removes the reference added by dlm_add_ast and may result in the lkb being freed */ diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h index 1b5fc5f428fd..bcb1aaba519d 100644 --- a/fs/dlm/ast.h +++ b/fs/dlm/ast.h @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -13,7 +13,7 @@ #ifndef __ASTD_DOT_H__ #define __ASTD_DOT_H__ -void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode); +void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode); void dlm_del_ast(struct dlm_lkb *lkb); void dlm_astd_wake(void); diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 826d3dc6e0ab..f632b58cd222 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -232,11 +232,17 @@ struct dlm_lkb { int8_t lkb_status; /* granted, waiting, convert */ int8_t lkb_rqmode; /* requested lock mode */ int8_t lkb_grmode; /* granted lock mode */ - int8_t lkb_bastmode; /* requested mode */ int8_t lkb_highbast; /* highest mode bast sent for */ + int8_t lkb_wait_type; /* type of reply waiting for */ int8_t lkb_wait_count; int8_t lkb_ast_type; /* type of ast queued for */ + int8_t lkb_ast_first; /* type of first ast queued */ + + int8_t lkb_bastmode; /* req mode of queued bast */ + int8_t lkb_castmode; /* gr mode of queued cast */ + int8_t lkb_bastmode_done; /* last delivered bastmode */ + int8_t lkb_castmode_done; /* last delivered castmode */ struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ struct list_head lkb_statequeue; /* rsb g/c/w list */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 9c0c1db1e105..e08ea93432bc 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) lkb->lkb_lksb->sb_status = rv; lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags; - dlm_add_ast(lkb, AST_COMP, 0); + dlm_add_ast(lkb, AST_COMP, lkb->lkb_grmode); } static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) diff --git a/fs/dlm/user.c b/fs/dlm/user.c index e73a4bb572aa..a4bfd31ac45b 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. + * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -173,7 +173,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type) /* we could possibly check if the cancel of an orphan has resulted in the lkb being removed and then remove that lkb from the orphans list and free it */ -void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode) +void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode) { struct dlm_ls *ls; struct dlm_user_args *ua; @@ -206,8 +206,10 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode) ast_type = lkb->lkb_ast_type; lkb->lkb_ast_type |= type; - if (bastmode) - lkb->lkb_bastmode = bastmode; + if (type == AST_BAST) + lkb->lkb_bastmode = mode; + else + lkb->lkb_castmode = mode; if (!ast_type) { kref_get(&lkb->lkb_ref); diff --git a/fs/dlm/user.h b/fs/dlm/user.h index 1c9686492286..f196091dd7ff 100644 --- a/fs/dlm/user.h +++ b/fs/dlm/user.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. + * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -9,7 +9,7 @@ #ifndef __USER_DOT_H__ #define __USER_DOT_H__ -void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode); +void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode); int dlm_user_init(void); void dlm_user_exit(void); int dlm_device_deregister(struct dlm_ls *ls); -- cgit v1.2.3 From 122ca0076e5f84fa12943f22f6586ff285670783 Mon Sep 17 00:00:00 2001 From: Steve French <sfrench@us.ibm.com> Date: Wed, 24 Feb 2010 21:56:48 +0000 Subject: [CIFS] Use unsigned ea length for clarity Jeff correctly noted that using unsigned ea length is more intuitive. CC: Jeff Lyaton <jlayton@redhat.com> Signed-off-by: Steve French <sfrench@us.ibm.com> --- fs/cifs/cifssmb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 99ae57d01d4f..b79ff68c47c7 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -5392,7 +5392,7 @@ QAllEAsRetry: temp_fea = ea_response_data->list; temp_ptr = (char *)temp_fea; while (list_len > 0) { - int name_len; + unsigned int name_len; __u16 value_len; list_len -= 4; -- cgit v1.2.3 From 58255a4e3ce506b43bb14d5579006731a981490d Mon Sep 17 00:00:00 2001 From: Chuck Lever <chuck.lever@oracle.com> Date: Wed, 24 Feb 2010 13:48:06 -0800 Subject: NFSD: NFSv4 callback client should use RPC_TASK_SOFTCONN The server's callback client should stop trying to connect to the client's callback server as soon as it gets ECONNREFUSED. The NFS server's callback client does not call rpc_ping(), but appears to have it's own "ping" procedure, so it wasn't covered by commit caabea8a. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu> --- fs/nfsd/nfs4callback.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c6eed2a3b093..8fa412ce8021 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -542,7 +542,8 @@ void do_probe_callback(struct nfs4_client *clp) }; int status; - status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT, + status = rpc_call_async(cb->cb_client, &msg, + RPC_TASK_SOFT | RPC_TASK_SOFTCONN, &nfsd4_cb_probe_ops, (void *)clp); if (status) { warn_no_callback_path(clp, status); -- cgit v1.2.3 From d7b619cf56218704ffce9d510aa497f0a0bcda0b Mon Sep 17 00:00:00 2001 From: Steve French <sfrench@us.ibm.com> Date: Thu, 25 Feb 2010 05:36:46 +0000 Subject: [CIFS] pSesInfo->sesSem is used as mutex. Rename it to session_mutex and convert it to a real mutex. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Jeff Layton <jlayton@redhat.com> Signed-off-by: Steve French <sfrench@us.ibm.com> --- fs/cifs/cifsglob.h | 2 +- fs/cifs/cifssmb.c | 12 ++++++------ fs/cifs/connect.c | 8 ++++---- fs/cifs/misc.c | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ed751bb657db..a1c817eb291a 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -205,7 +205,7 @@ struct cifsUidInfo { struct cifsSesInfo { struct list_head smb_ses_list; struct list_head tcon_list; - struct semaphore sesSem; + struct mutex session_mutex; #if 0 struct cifsUidInfo *uidInfo; /* pointer to user info */ #endif diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index b79ff68c47c7..9d17df3e0768 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -170,19 +170,19 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command) * need to prevent multiple threads trying to simultaneously * reconnect the same SMB session */ - down(&ses->sesSem); + mutex_lock(&ses->session_mutex); if (ses->need_reconnect) rc = cifs_setup_session(0, ses, nls_codepage); /* do we need to reconnect tcon? */ if (rc || !tcon->need_reconnect) { - up(&ses->sesSem); + mutex_unlock(&ses->session_mutex); goto out; } mark_open_files_invalid(tcon); rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); - up(&ses->sesSem); + mutex_unlock(&ses->session_mutex); cFYI(1, ("reconnect tcon rc = %d", rc)); if (rc) @@ -700,13 +700,13 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses) if (!ses || !ses->server) return -EIO; - down(&ses->sesSem); + mutex_lock(&ses->session_mutex); if (ses->need_reconnect) goto session_already_dead; /* no need to send SMBlogoff if uid already closed due to reconnect */ rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB); if (rc) { - up(&ses->sesSem); + mutex_unlock(&ses->session_mutex); return rc; } @@ -721,7 +721,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses) pSMB->AndXCommand = 0xFF; rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0); session_already_dead: - up(&ses->sesSem); + mutex_unlock(&ses->session_mutex); /* if session dead then we do not need to do ulogoff, since server closed smb session, no sense reporting diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2e9e09ca0e30..45eb6cba793f 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2388,13 +2388,13 @@ try_mount_again: */ cifs_put_tcp_session(srvTcp); - down(&pSesInfo->sesSem); + mutex_lock(&pSesInfo->session_mutex); if (pSesInfo->need_reconnect) { cFYI(1, ("Session needs reconnect")); rc = cifs_setup_session(xid, pSesInfo, cifs_sb->local_nls); } - up(&pSesInfo->sesSem); + mutex_unlock(&pSesInfo->session_mutex); } else if (!rc) { cFYI(1, ("Existing smb sess not found")); pSesInfo = sesInfoAlloc(); @@ -2437,12 +2437,12 @@ try_mount_again: } pSesInfo->linux_uid = volume_info->linux_uid; pSesInfo->overrideSecFlg = volume_info->secFlg; - down(&pSesInfo->sesSem); + mutex_lock(&pSesInfo->session_mutex); /* BB FIXME need to pass vol->secFlgs BB */ rc = cifs_setup_session(xid, pSesInfo, cifs_sb->local_nls); - up(&pSesInfo->sesSem); + mutex_unlock(&pSesInfo->session_mutex); } /* search for existing tcon to this server share */ diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index d27d4ec6579b..d1474996a812 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -79,7 +79,7 @@ sesInfoAlloc(void) ++ret_buf->ses_count; INIT_LIST_HEAD(&ret_buf->smb_ses_list); INIT_LIST_HEAD(&ret_buf->tcon_list); - init_MUTEX(&ret_buf->sesSem); + mutex_init(&ret_buf->session_mutex); } return ret_buf; } -- cgit v1.2.3 From 7dc52157982ab771f40e3c0b7dc55b954c3c2d19 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Date: Mon, 22 Feb 2010 17:04:52 -0800 Subject: vfs: Apply lockdep-based checking to rcu_dereference() uses Add lockdep-ified RCU primitives to alloc_fd(), files_fdtable() and fcheck_files(). Cc: Alexander Viro <viro@zeniv.linux.org.uk> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: Alexander Viro <viro@zeniv.linux.org.uk> LKML-Reference: <1266887105-1528-8-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- fs/file.c | 2 +- fs/proc/array.c | 2 ++ fs/proc/base.c | 6 +++++- include/linux/fdtable.h | 8 ++++++-- 4 files changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 87e129030ab1..38039af67663 100644 --- a/fs/file.c +++ b/fs/file.c @@ -478,7 +478,7 @@ repeat: error = fd; #if 1 /* Sanity check */ - if (rcu_dereference(fdt->fd[fd]) != NULL) { + if (rcu_dereference_raw(fdt->fd[fd]) != NULL) { printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); rcu_assign_pointer(fdt->fd[fd], NULL); } diff --git a/fs/proc/array.c b/fs/proc/array.c index 13b5d0708175..18e20feee251 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -270,7 +270,9 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) blocked = p->blocked; collect_sigign_sigcatch(p, &ignored, &caught); num_threads = atomic_read(&p->signal->count); + rcu_read_lock(); /* FIXME: is this correct? */ qsize = atomic_read(&__task_cred(p)->user->sigpending); + rcu_read_unlock(); qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; unlock_task_sighand(p, &flags); } diff --git a/fs/proc/base.c b/fs/proc/base.c index 58324c299165..623e2ffb5d2b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1095,8 +1095,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, if (!capable(CAP_AUDIT_CONTROL)) return -EPERM; - if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) + rcu_read_lock(); + if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { + rcu_read_unlock(); return -EPERM; + } + rcu_read_unlock(); if (count >= PAGE_SIZE) count = PAGE_SIZE - 1; diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index a2ec74bc4812..144412ffaced 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -57,7 +57,11 @@ struct files_struct { struct file * fd_array[NR_OPEN_DEFAULT]; }; -#define files_fdtable(files) (rcu_dereference((files)->fdt)) +#define files_fdtable(files) \ + (rcu_dereference_check((files)->fdt, \ + rcu_read_lock_held() || \ + lockdep_is_held(&(files)->file_lock) || \ + atomic_read(&files->count) == 1)) struct file_operations; struct vfsmount; @@ -78,7 +82,7 @@ static inline struct file * fcheck_files(struct files_struct *files, unsigned in struct fdtable *fdt = files_fdtable(files); if (fd < fdt->max_fds) - file = rcu_dereference(fdt->fd[fd]); + file = rcu_dereference_check(fdt->fd[fd], rcu_read_lock_held() || lockdep_is_held(&files->file_lock) || atomic_read(&files->count) == 1); return file; } -- cgit v1.2.3 From 8a78362c4eefc1deddbefe2c7f38aabbc2429d6b Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" <martin.petersen@oracle.com> Date: Fri, 26 Feb 2010 00:20:39 -0500 Subject: block: Consolidate phys_segment and hw_segment limits Except for SCSI no device drivers distinguish between physical and hardware segment limits. Consolidate the two into a single segment limit. Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com> Signed-off-by: Jens Axboe <jens.axboe@oracle.com> --- arch/um/drivers/ubd_kern.c | 2 +- block/blk-core.c | 3 +- block/blk-merge.c | 8 ++--- block/blk-settings.c | 60 ++++++++----------------------------- drivers/ata/sata_nv.c | 2 +- drivers/block/DAC960.c | 2 +- drivers/block/cciss.c | 5 +--- drivers/block/cpqarray.c | 5 +--- drivers/block/drbd/drbd_nl.c | 3 +- drivers/block/paride/pf.c | 3 +- drivers/block/pktcdvd.c | 4 +-- drivers/block/ps3disk.c | 3 +- drivers/block/ps3vram.c | 3 +- drivers/block/sunvdc.c | 3 +- drivers/block/sx8.c | 3 +- drivers/block/ub.c | 3 +- drivers/block/viodasd.c | 3 +- drivers/block/xen-blkfront.c | 3 +- drivers/cdrom/gdrom.c | 2 +- drivers/cdrom/viocd.c | 3 +- drivers/ide/ide-probe.c | 3 +- drivers/md/raid5.c | 2 +- drivers/memstick/core/mspro_block.c | 3 +- drivers/message/i2o/i2o_block.c | 3 +- drivers/mmc/card/queue.c | 6 ++-- drivers/s390/block/dasd.c | 3 +- drivers/s390/char/tape_block.c | 3 +- drivers/scsi/ibmvscsi/ibmvfc.c | 4 +-- drivers/scsi/scsi_lib.c | 4 +-- drivers/scsi/sg.c | 6 ++-- drivers/scsi/st.c | 3 +- drivers/staging/hv/blkvsc_drv.c | 5 +--- fs/bio.c | 9 ++---- include/linux/blkdev.h | 27 ++++++++++------- include/linux/i2o.h | 2 +- 35 files changed, 70 insertions(+), 136 deletions(-) (limited to 'fs') diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index c2051b0737cb..c1ff6903b622 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -849,7 +849,7 @@ static int ubd_add(int n, char **error_out) } ubd_dev->queue->queuedata = ubd_dev; - blk_queue_max_hw_segments(ubd_dev->queue, MAX_SG); + blk_queue_max_segments(ubd_dev->queue, MAX_SG); err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]); if(err){ *error_out = "Failed to register device"; diff --git a/block/blk-core.c b/block/blk-core.c index 36c0deebc2dc..9fe174dc74d1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1614,8 +1614,7 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq) * limitation. */ blk_recalc_rq_segments(rq); - if (rq->nr_phys_segments > queue_max_phys_segments(q) || - rq->nr_phys_segments > queue_max_hw_segments(q)) { + if (rq->nr_phys_segments > queue_max_segments(q)) { printk(KERN_ERR "%s: over max segments limit.\n", __func__); return -EIO; } diff --git a/block/blk-merge.c b/block/blk-merge.c index 99cb5cf1f447..5e7dc9973458 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -206,8 +206,7 @@ static inline int ll_new_hw_segment(struct request_queue *q, { int nr_phys_segs = bio_phys_segments(q, bio); - if (req->nr_phys_segments + nr_phys_segs > queue_max_hw_segments(q) || - req->nr_phys_segments + nr_phys_segs > queue_max_phys_segments(q)) { + if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -300,10 +299,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, total_phys_segments--; } - if (total_phys_segments > queue_max_phys_segments(q)) - return 0; - - if (total_phys_segments > queue_max_hw_segments(q)) + if (total_phys_segments > queue_max_segments(q)) return 0; /* Merge is OK... */ diff --git a/block/blk-settings.c b/block/blk-settings.c index 61afae9dbc6d..31e7a9375c13 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -91,8 +91,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy); */ void blk_set_default_limits(struct queue_limits *lim) { - lim->max_phys_segments = MAX_PHYS_SEGMENTS; - lim->max_hw_segments = MAX_HW_SEGMENTS; + lim->max_segments = BLK_MAX_SEGMENTS; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = BLK_DEF_MAX_SECTORS; @@ -252,17 +251,15 @@ void blk_queue_max_discard_sectors(struct request_queue *q, EXPORT_SYMBOL(blk_queue_max_discard_sectors); /** - * blk_queue_max_phys_segments - set max phys segments for a request for this queue + * blk_queue_max_segments - set max hw segments for a request for this queue * @q: the request queue for the device * @max_segments: max number of segments * * Description: * Enables a low level driver to set an upper limit on the number of - * physical data segments in a request. This would be the largest sized - * scatter list the driver could handle. + * hw data segments in a request. **/ -void blk_queue_max_phys_segments(struct request_queue *q, - unsigned short max_segments) +void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments) { if (!max_segments) { max_segments = 1; @@ -270,33 +267,9 @@ void blk_queue_max_phys_segments(struct request_queue *q, __func__, max_segments); } - q->limits.max_phys_segments = max_segments; + q->limits.max_segments = max_segments; } -EXPORT_SYMBOL(blk_queue_max_phys_segments); - -/** - * blk_queue_max_hw_segments - set max hw segments for a request for this queue - * @q: the request queue for the device - * @max_segments: max number of segments - * - * Description: - * Enables a low level driver to set an upper limit on the number of - * hw data segments in a request. This would be the largest number of - * address/length pairs the host adapter can actually give at once - * to the device. - **/ -void blk_queue_max_hw_segments(struct request_queue *q, - unsigned short max_segments) -{ - if (!max_segments) { - max_segments = 1; - printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_segments); - } - - q->limits.max_hw_segments = max_segments; -} -EXPORT_SYMBOL(blk_queue_max_hw_segments); +EXPORT_SYMBOL(blk_queue_max_segments); /** * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg @@ -531,11 +504,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); - t->max_phys_segments = min_not_zero(t->max_phys_segments, - b->max_phys_segments); - - t->max_hw_segments = min_not_zero(t->max_hw_segments, - b->max_hw_segments); + t->max_segments = min_not_zero(t->max_segments, b->max_segments); t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); @@ -739,22 +708,19 @@ EXPORT_SYMBOL(blk_queue_update_dma_pad); * does is adjust the queue so that the buf is always appended * silently to the scatterlist. * - * Note: This routine adjusts max_hw_segments to make room for - * appending the drain buffer. If you call - * blk_queue_max_hw_segments() or blk_queue_max_phys_segments() after - * calling this routine, you must set the limit to one fewer than your - * device can support otherwise there won't be room for the drain - * buffer. + * Note: This routine adjusts max_hw_segments to make room for appending + * the drain buffer. If you call blk_queue_max_segments() after calling + * this routine, you must set the limit to one fewer than your device + * can support otherwise there won't be room for the drain buffer. */ int blk_queue_dma_drain(struct request_queue *q, dma_drain_needed_fn *dma_drain_needed, void *buf, unsigned int size) { - if (queue_max_hw_segments(q) < 2 || queue_max_phys_segments(q) < 2) + if (queue_max_segments(q) < 2) return -EINVAL; /* make room for appending the drain */ - blk_queue_max_hw_segments(q, queue_max_hw_segments(q) - 1); - blk_queue_max_phys_segments(q, queue_max_phys_segments(q) - 1); + blk_queue_max_segments(q, queue_max_segments(q) - 1); q->dma_drain_needed = dma_drain_needed; q->dma_drain_buffer = buf; q->dma_drain_size = size; diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c index 0c82d335c55d..684fe04dbbb7 100644 --- a/drivers/ata/sata_nv.c +++ b/drivers/ata/sata_nv.c @@ -772,7 +772,7 @@ static int nv_adma_slave_config(struct scsi_device *sdev) } blk_queue_segment_boundary(sdev->request_queue, segment_boundary); - blk_queue_max_hw_segments(sdev->request_queue, sg_tablesize); + blk_queue_max_segments(sdev->request_queue, sg_tablesize); ata_port_printk(ap, KERN_INFO, "DMA mask 0x%llX, segment boundary 0x%lX, hw segs %hu\n", (unsigned long long)*ap->host->dev->dma_mask, diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 1c0cd35e1913..459f1bc25a7b 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -2534,7 +2534,7 @@ static bool DAC960_RegisterBlockDevice(DAC960_Controller_T *Controller) blk_queue_bounce_limit(RequestQueue, Controller->BounceBufferLimit); RequestQueue->queuedata = Controller; blk_queue_max_hw_segments(RequestQueue, Controller->DriverScatterGatherLimit); - blk_queue_max_phys_segments(RequestQueue, Controller->DriverScatterGatherLimit); + blk_queue_max_segments(RequestQueue, Controller->DriverScatterGatherLimit); blk_queue_max_hw_sectors(RequestQueue, Controller->MaxBlocksPerCommand); disk->queue = RequestQueue; sprintf(disk->disk_name, "rd/c%dd%d", Controller->ControllerNumber, n); diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 030e52d72254..a29e69418a03 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1797,10 +1797,7 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk, blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask); /* This is a hardware imposed limit. */ - blk_queue_max_hw_segments(disk->queue, h->maxsgentries); - - /* This is a limit in the driver and could be eliminated. */ - blk_queue_max_phys_segments(disk->queue, h->maxsgentries); + blk_queue_max_segments(disk->queue, h->maxsgentries); blk_queue_max_hw_sectors(disk->queue, h->cciss_max_sectors); diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index 6422651ec364..91d11631cec9 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -448,11 +448,8 @@ static int __init cpqarray_register_ctlr( int i, struct pci_dev *pdev) blk_queue_bounce_limit(q, hba[i]->pci_dev->dma_mask); /* This is a hardware imposed limit. */ - blk_queue_max_hw_segments(q, SG_MAX); + blk_queue_max_segments(q, SG_MAX); - /* This is a driver limit and could be eliminated. */ - blk_queue_max_phys_segments(q, SG_MAX); - init_timer(&hba[i]->timer); hba[i]->timer.expires = jiffies + IDA_TIMER; hba[i]->timer.data = (unsigned long)hba[i]; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 9b55e64196fc..4df3b40b1057 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -710,8 +710,7 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s); blk_queue_max_hw_sectors(q, max_seg_s >> 9); - blk_queue_max_phys_segments(q, max_segments ? max_segments : MAX_PHYS_SEGMENTS); - blk_queue_max_hw_segments(q, max_segments ? max_segments : MAX_HW_SEGMENTS); + blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); blk_queue_max_segment_size(q, max_seg_s); blk_queue_logical_block_size(q, 512); blk_queue_segment_boundary(q, PAGE_SIZE-1); diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index ea54ea393553..ddb4f9abd480 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -956,8 +956,7 @@ static int __init pf_init(void) return -ENOMEM; } - blk_queue_max_phys_segments(pf_queue, cluster); - blk_queue_max_hw_segments(pf_queue, cluster); + blk_queue_max_segments(pf_queue, cluster); for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) { struct gendisk *disk = pf->disk; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 6e1daa24da2f..b72935b8f203 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -950,14 +950,14 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q) { if ((pd->settings.size << 9) / CD_FRAMESIZE - <= queue_max_phys_segments(q)) { + <= queue_max_segments(q)) { /* * The cdrom device can handle one segment/frame */ clear_bit(PACKET_MERGE_SEGS, &pd->flags); return 0; } else if ((pd->settings.size << 9) / PAGE_SIZE - <= queue_max_phys_segments(q)) { + <= queue_max_segments(q)) { /* * We can handle this case at the expense of some extra memory * copies during write operations diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 9cd1a4a542b8..bc95469d33c1 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -482,8 +482,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev) blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH, ps3disk_prepare_flush); - blk_queue_max_phys_segments(queue, -1); - blk_queue_max_hw_segments(queue, -1); + blk_queue_max_segments(queue, -1); blk_queue_max_segment_size(queue, dev->bounce_size); gendisk = alloc_disk(PS3DISK_MINORS); diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 6416b262934b..83ebb390b164 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -751,8 +751,7 @@ static int __devinit ps3vram_probe(struct ps3_system_bus_device *dev) priv->queue = queue; queue->queuedata = dev; blk_queue_make_request(queue, ps3vram_make_request); - blk_queue_max_phys_segments(queue, MAX_PHYS_SEGMENTS); - blk_queue_max_hw_segments(queue, MAX_HW_SEGMENTS); + blk_queue_max_segments(queue, BLK_MAX_HW_SEGMENTS); blk_queue_max_segment_size(queue, BLK_MAX_SEGMENT_SIZE); blk_queue_max_hw_sectors(queue, BLK_SAFE_MAX_SECTORS); diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index dd30cddd0f7f..48e8fee9f2d4 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -691,8 +691,7 @@ static int probe_disk(struct vdc_port *port) port->disk = g; - blk_queue_max_hw_segments(q, port->ring_cookies); - blk_queue_max_phys_segments(q, port->ring_cookies); + blk_queue_max_segments(q, port->ring_cookies); blk_queue_max_hw_sectors(q, port->max_xfer_size); g->major = vdc_major; g->first_minor = port->vio.vdev->dev_no << PARTITION_SHIFT; diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 7bd7b2f83116..b70f0fca9a42 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -1518,8 +1518,7 @@ static int carm_init_disks(struct carm_host *host) break; } disk->queue = q; - blk_queue_max_hw_segments(q, CARM_MAX_REQ_SG); - blk_queue_max_phys_segments(q, CARM_MAX_REQ_SG); + blk_queue_max_segments(q, CARM_MAX_REQ_SG); blk_queue_segment_boundary(q, CARM_SG_BOUNDARY); q->queuedata = port; diff --git a/drivers/block/ub.c b/drivers/block/ub.c index 352ea24d66e8..2e889838e819 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -2320,8 +2320,7 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum) disk->queue = q; blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); - blk_queue_max_hw_segments(q, UB_MAX_REQ_SG); - blk_queue_max_phys_segments(q, UB_MAX_REQ_SG); + blk_queue_max_segments(q, UB_MAX_REQ_SG); blk_queue_segment_boundary(q, 0xffffffff); /* Dubious. */ blk_queue_max_hw_sectors(q, UB_MAX_SECTORS); blk_queue_logical_block_size(q, lun->capacity.bsize); diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c index d44ece7d7b7c..c12b31362ac6 100644 --- a/drivers/block/viodasd.c +++ b/drivers/block/viodasd.c @@ -471,8 +471,7 @@ retry: } d->disk = g; - blk_queue_max_hw_segments(q, VIOMAXBLOCKDMA); - blk_queue_max_phys_segments(q, VIOMAXBLOCKDMA); + blk_queue_max_segments(q, VIOMAXBLOCKDMA); blk_queue_max_hw_sectors(q, VIODASD_MAXSECTORS); g->major = VIODASD_MAJOR; g->first_minor = dev_no << PARTITION_SHIFT; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index f9861aaa1fef..9c09694b2520 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -353,8 +353,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) blk_queue_max_segment_size(rq, PAGE_SIZE); /* Ensure a merged request will fit in a single I/O ring slot. */ - blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); - blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); /* Make sure buffer addresses are sector-aligned. */ blk_queue_dma_alignment(rq, 511); diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index e789e6c9a422..03c71f7698cb 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -741,7 +741,7 @@ static int __devinit probe_gdrom_setupqueue(void) { blk_queue_logical_block_size(gd.gdrom_rq, GDROM_HARD_SECTOR); /* using DMA so memory will need to be contiguous */ - blk_queue_max_hw_segments(gd.gdrom_rq, 1); + blk_queue_max_segments(gd.gdrom_rq, 1); /* set a large max size to get most from DMA */ blk_queue_max_segment_size(gd.gdrom_rq, 0x40000); gd.disk->queue = gd.gdrom_rq; diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c index b1dfd23eb832..cc435be0bc13 100644 --- a/drivers/cdrom/viocd.c +++ b/drivers/cdrom/viocd.c @@ -616,8 +616,7 @@ static int viocd_probe(struct vio_dev *vdev, const struct vio_device_id *id) gendisk->first_minor = deviceno; strncpy(gendisk->disk_name, c->name, sizeof(gendisk->disk_name)); - blk_queue_max_hw_segments(q, 1); - blk_queue_max_phys_segments(q, 1); + blk_queue_max_segments(q, 1); blk_queue_max_hw_sectors(q, 4096 / 512); gendisk->queue = q; gendisk->fops = &viocd_fops; diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 1ec8b31277bd..f8c1ae6ad74c 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -790,8 +790,7 @@ static int ide_init_queue(ide_drive_t *drive) max_sg_entries >>= 1; #endif /* CONFIG_PCI */ - blk_queue_max_hw_segments(q, max_sg_entries); - blk_queue_max_phys_segments(q, max_sg_entries); + blk_queue_max_segments(q, max_sg_entries); /* assign drive queue */ drive->queue = q; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ceb24afdc147..509c8f3dd9a5 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3739,7 +3739,7 @@ static int bio_fits_rdev(struct bio *bi) if ((bi->bi_size>>9) > queue_max_sectors(q)) return 0; blk_recount_segments(q, bi); - if (bi->bi_phys_segments > queue_max_phys_segments(q)) + if (bi->bi_phys_segments > queue_max_segments(q)) return 0; if (q->merge_bvec_fn) diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 44d4178c4c15..972b87069d55 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -1227,8 +1227,7 @@ static int mspro_block_init_disk(struct memstick_dev *card) blk_queue_bounce_limit(msb->queue, limit); blk_queue_max_hw_sectors(msb->queue, MSPRO_BLOCK_MAX_PAGES); - blk_queue_max_phys_segments(msb->queue, MSPRO_BLOCK_MAX_SEGS); - blk_queue_max_hw_segments(msb->queue, MSPRO_BLOCK_MAX_SEGS); + blk_queue_max_segments(msb->queue, MSPRO_BLOCK_MAX_SEGS); blk_queue_max_segment_size(msb->queue, MSPRO_BLOCK_MAX_PAGES * msb->page_size); diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index d033cfdb516f..2658b1484a2c 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -1065,9 +1065,8 @@ static int i2o_block_probe(struct device *dev) queue = gd->queue; queue->queuedata = i2o_blk_dev; - blk_queue_max_phys_segments(queue, I2O_MAX_PHYS_SEGMENTS); blk_queue_max_hw_sectors(queue, max_sectors); - blk_queue_max_hw_segments(queue, i2o_sg_tablesize(c, body_size)); + blk_queue_max_segments(queue, i2o_sg_tablesize(c, body_size)); osm_debug("max sectors = %d\n", queue->max_sectors); osm_debug("phys segments = %d\n", queue->max_phys_segments); diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c index 09b633d5657b..381fe032caa1 100644 --- a/drivers/mmc/card/queue.c +++ b/drivers/mmc/card/queue.c @@ -155,8 +155,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, spinlock_t *lock if (mq->bounce_buf) { blk_queue_bounce_limit(mq->queue, BLK_BOUNCE_ANY); blk_queue_max_hw_sectors(mq->queue, bouncesz / 512); - blk_queue_max_phys_segments(mq->queue, bouncesz / 512); - blk_queue_max_hw_segments(mq->queue, bouncesz / 512); + blk_queue_max_segments(mq->queue, bouncesz / 512); blk_queue_max_segment_size(mq->queue, bouncesz); mq->sg = kmalloc(sizeof(struct scatterlist), @@ -182,8 +181,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, spinlock_t *lock blk_queue_bounce_limit(mq->queue, limit); blk_queue_max_hw_sectors(mq->queue, min(host->max_blk_count, host->max_req_size / 512)); - blk_queue_max_phys_segments(mq->queue, host->max_phys_segs); - blk_queue_max_hw_segments(mq->queue, host->max_hw_segs); + blk_queue_max_segments(mq->queue, host->max_hw_segs); blk_queue_max_segment_size(mq->queue, host->max_seg_size); mq->sg = kmalloc(sizeof(struct scatterlist) * diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 14b1e25b9dcf..8831e9308d05 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -2130,8 +2130,7 @@ static void dasd_setup_queue(struct dasd_block *block) blk_queue_logical_block_size(block->request_queue, block->bp_block); max = block->base->discipline->max_blocks << block->s2b_shift; blk_queue_max_hw_sectors(block->request_queue, max); - blk_queue_max_phys_segments(block->request_queue, -1L); - blk_queue_max_hw_segments(block->request_queue, -1L); + blk_queue_max_segments(block->request_queue, -1L); /* with page sized segments we can translate each segement into * one idaw/tidaw */ diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c index 509ed056fddd..097da8ce6be6 100644 --- a/drivers/s390/char/tape_block.c +++ b/drivers/s390/char/tape_block.c @@ -223,8 +223,7 @@ tapeblock_setup_device(struct tape_device * device) blk_queue_logical_block_size(blkdat->request_queue, TAPEBLOCK_HSEC_SIZE); blk_queue_max_hw_sectors(blkdat->request_queue, TAPEBLOCK_MAX_SEC); - blk_queue_max_phys_segments(blkdat->request_queue, -1L); - blk_queue_max_hw_segments(blkdat->request_queue, -1L); + blk_queue_max_segments(blkdat->request_queue, -1L); blk_queue_max_segment_size(blkdat->request_queue, -1L); blk_queue_segment_boundary(blkdat->request_queue, -1L); diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index 87b536a97cb4..732f6d35b4a8 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -4195,7 +4195,7 @@ static void ibmvfc_tgt_add_rport(struct ibmvfc_target *tgt) if (tgt->service_parms.class3_parms[0] & 0x80000000) rport->supported_classes |= FC_COS_CLASS3; if (rport->rqst_q) - blk_queue_max_hw_segments(rport->rqst_q, 1); + blk_queue_max_segments(rport->rqst_q, 1); } else tgt_dbg(tgt, "rport add failed\n"); spin_unlock_irqrestore(vhost->host->host_lock, flags); @@ -4669,7 +4669,7 @@ static int ibmvfc_probe(struct vio_dev *vdev, const struct vio_device_id *id) } if (shost_to_fc_host(shost)->rqst_q) - blk_queue_max_hw_segments(shost_to_fc_host(shost)->rqst_q, 1); + blk_queue_max_segments(shost_to_fc_host(shost)->rqst_q, 1); dev_set_drvdata(dev, vhost); spin_lock(&ibmvfc_driver_lock); list_add_tail(&vhost->queue, &ibmvfc_head); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index ac3cca74bdfb..f8fbf47377ae 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1624,8 +1624,8 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost, /* * this limit is imposed by hardware restrictions */ - blk_queue_max_hw_segments(q, shost->sg_tablesize); - blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS); + blk_queue_max_segments(q, min_t(unsigned short, shost->sg_tablesize, + SCSI_MAX_SG_CHAIN_SEGMENTS)); blk_queue_max_hw_sectors(q, shost->max_sectors); blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost)); diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 040f751809ea..c996d98636f3 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -287,8 +287,7 @@ sg_open(struct inode *inode, struct file *filp) if (list_empty(&sdp->sfds)) { /* no existing opens on this device */ sdp->sgdebug = 0; q = sdp->device->request_queue; - sdp->sg_tablesize = min(queue_max_hw_segments(q), - queue_max_phys_segments(q)); + sdp->sg_tablesize = queue_max_segments(q); } if ((sfp = sg_add_sfp(sdp, dev))) filp->private_data = sfp; @@ -1376,8 +1375,7 @@ static Sg_device *sg_alloc(struct gendisk *disk, struct scsi_device *scsidp) sdp->device = scsidp; INIT_LIST_HEAD(&sdp->sfds); init_waitqueue_head(&sdp->o_excl_wait); - sdp->sg_tablesize = min(queue_max_hw_segments(q), - queue_max_phys_segments(q)); + sdp->sg_tablesize = queue_max_segments(q); sdp->index = k; kref_init(&sdp->d_ref); diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index d04ea9a6f673..f67d1a159aad 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -3983,8 +3983,7 @@ static int st_probe(struct device *dev) return -ENODEV; } - i = min(queue_max_hw_segments(SDp->request_queue), - queue_max_phys_segments(SDp->request_queue)); + i = queue_max_segments(SDp->request_queue); if (st_max_sg_segs < i) i = st_max_sg_segs; buffer = new_tape_buffer((SDp->host)->unchecked_isa_dma, i); diff --git a/drivers/staging/hv/blkvsc_drv.c b/drivers/staging/hv/blkvsc_drv.c index 62b282844a53..45d908114d11 100644 --- a/drivers/staging/hv/blkvsc_drv.c +++ b/drivers/staging/hv/blkvsc_drv.c @@ -363,10 +363,7 @@ static int blkvsc_probe(struct device *device) blkdev->gd->queue = blk_init_queue(blkvsc_request, &blkdev->lock); blk_queue_max_segment_size(blkdev->gd->queue, PAGE_SIZE); - blk_queue_max_phys_segments(blkdev->gd->queue, - MAX_MULTIPAGE_BUFFER_COUNT); - blk_queue_max_hw_segments(blkdev->gd->queue, - MAX_MULTIPAGE_BUFFER_COUNT); + blk_queue_max_segments(blkdev->gd->queue, MAX_MULTIPAGE_BUFFER_COUNT); blk_queue_segment_boundary(blkdev->gd->queue, PAGE_SIZE-1); blk_queue_bounce_limit(blkdev->gd->queue, BLK_BOUNCE_ANY); blk_queue_dma_alignment(blkdev->gd->queue, 511); diff --git a/fs/bio.c b/fs/bio.c index 88094afc29ea..dc17afd672e3 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -507,10 +507,8 @@ int bio_get_nr_vecs(struct block_device *bdev) int nr_pages; nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (nr_pages > queue_max_phys_segments(q)) - nr_pages = queue_max_phys_segments(q); - if (nr_pages > queue_max_hw_segments(q)) - nr_pages = queue_max_hw_segments(q); + if (nr_pages > queue_max_segments(q)) + nr_pages = queue_max_segments(q); return nr_pages; } @@ -575,8 +573,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * make this too complex. */ - while (bio->bi_phys_segments >= queue_max_phys_segments(q) - || bio->bi_phys_segments >= queue_max_hw_segments(q)) { + while (bio->bi_phys_segments >= queue_max_segments(q)) { if (retried_segments) return 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 57bc48446e92..ebd22dbed861 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -316,8 +316,7 @@ struct queue_limits { unsigned int discard_alignment; unsigned short logical_block_size; - unsigned short max_hw_segments; - unsigned short max_phys_segments; + unsigned short max_segments; unsigned char misaligned; unsigned char discard_misaligned; @@ -929,8 +928,19 @@ static inline void blk_queue_max_sectors(struct request_queue *q, unsigned int m blk_queue_max_hw_sectors(q, max); } -extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short); -extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short); +extern void blk_queue_max_segments(struct request_queue *, unsigned short); + +static inline void blk_queue_max_phys_segments(struct request_queue *q, unsigned short max) +{ + blk_queue_max_segments(q, max); +} + +static inline void blk_queue_max_hw_segments(struct request_queue *q, unsigned short max) +{ + blk_queue_max_segments(q, max); +} + + extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); @@ -1055,14 +1065,9 @@ static inline unsigned int queue_max_hw_sectors(struct request_queue *q) return q->limits.max_hw_sectors; } -static inline unsigned short queue_max_hw_segments(struct request_queue *q) -{ - return q->limits.max_hw_segments; -} - -static inline unsigned short queue_max_phys_segments(struct request_queue *q) +static inline unsigned short queue_max_segments(struct request_queue *q) { - return q->limits.max_phys_segments; + return q->limits.max_segments; } static inline unsigned int queue_max_segment_size(struct request_queue *q) diff --git a/include/linux/i2o.h b/include/linux/i2o.h index 4c4e57d1f19d..87018dc5527d 100644 --- a/include/linux/i2o.h +++ b/include/linux/i2o.h @@ -385,7 +385,7 @@ /* defines for max_sectors and max_phys_segments */ #define I2O_MAX_SECTORS 1024 #define I2O_MAX_SECTORS_LIMITED 128 -#define I2O_MAX_PHYS_SEGMENTS MAX_PHYS_SEGMENTS +#define I2O_MAX_PHYS_SEGMENTS BLK_MAX_SEGMENTS /* * Message structures -- cgit v1.2.3 From cf6620acc0f6fac57968aafef79ab372bdcf6157 Mon Sep 17 00:00:00 2001 From: David Teigland <teigland@redhat.com> Date: Wed, 24 Feb 2010 11:59:23 -0600 Subject: dlm: send reply before bast When the lock master processes a successful operation (request, convert, cancel, or unlock), it will process the effects of the change before sending the reply for the operation. The "effects" of the operation are: - blocking callbacks (basts) for any newly granted locks - waiting or converting locks that can now be granted The cast is queued on the local node when the reply from the lock master is received. This means that a lock holder can receive a bast for a lock mode that is doesn't yet know has been granted. Signed-off-by: David Teigland <teigland@redhat.com> --- fs/dlm/lock.c | 110 ++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 84 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index e08ea93432bc..d0e43a3da887 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -2280,20 +2280,30 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb) if (can_be_queued(lkb)) { error = -EINPROGRESS; add_lkb(r, lkb, DLM_LKSTS_WAITING); - send_blocking_asts(r, lkb); add_timeout(lkb); goto out; } error = -EAGAIN; - if (force_blocking_asts(lkb)) - send_blocking_asts_all(r, lkb); queue_cast(r, lkb, -EAGAIN); - out: return error; } +static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + switch (error) { + case -EAGAIN: + if (force_blocking_asts(lkb)) + send_blocking_asts_all(r, lkb); + break; + case -EINPROGRESS: + send_blocking_asts(r, lkb); + break; + } +} + static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) { int error = 0; @@ -2304,7 +2314,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) if (can_be_granted(r, lkb, 1, &deadlk)) { grant_lock(r, lkb); queue_cast(r, lkb, 0); - grant_pending_locks(r); goto out; } @@ -2334,7 +2343,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) if (_can_be_granted(r, lkb, 1)) { grant_lock(r, lkb); queue_cast(r, lkb, 0); - grant_pending_locks(r); goto out; } /* else fall through and move to convert queue */ @@ -2344,28 +2352,47 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) error = -EINPROGRESS; del_lkb(r, lkb); add_lkb(r, lkb, DLM_LKSTS_CONVERT); - send_blocking_asts(r, lkb); add_timeout(lkb); goto out; } error = -EAGAIN; - if (force_blocking_asts(lkb)) - send_blocking_asts_all(r, lkb); queue_cast(r, lkb, -EAGAIN); - out: return error; } +static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + switch (error) { + case 0: + grant_pending_locks(r); + /* grant_pending_locks also sends basts */ + break; + case -EAGAIN: + if (force_blocking_asts(lkb)) + send_blocking_asts_all(r, lkb); + break; + case -EINPROGRESS: + send_blocking_asts(r, lkb); + break; + } +} + static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) { remove_lock(r, lkb); queue_cast(r, lkb, -DLM_EUNLOCK); - grant_pending_locks(r); return -DLM_EUNLOCK; } +static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + grant_pending_locks(r); +} + /* returns: 0 did nothing, -DLM_ECANCEL canceled lock */ static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) @@ -2375,12 +2402,18 @@ static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) error = revert_lock(r, lkb); if (error) { queue_cast(r, lkb, -DLM_ECANCEL); - grant_pending_locks(r); return -DLM_ECANCEL; } return 0; } +static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, + int error) +{ + if (error) + grant_pending_locks(r); +} + /* * Four stage 3 varieties: * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock() @@ -2402,11 +2435,15 @@ static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) goto out; } - if (is_remote(r)) + if (is_remote(r)) { /* receive_request() calls do_request() on remote node */ error = send_request(r, lkb); - else + } else { error = do_request(r, lkb); + /* for remote locks the request_reply is sent + between do_request and do_request_effects */ + do_request_effects(r, lkb, error); + } out: return error; } @@ -2417,11 +2454,15 @@ static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) { int error; - if (is_remote(r)) + if (is_remote(r)) { /* receive_convert() calls do_convert() on remote node */ error = send_convert(r, lkb); - else + } else { error = do_convert(r, lkb); + /* for remote locks the convert_reply is sent + between do_convert and do_convert_effects */ + do_convert_effects(r, lkb, error); + } return error; } @@ -2432,11 +2473,15 @@ static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) { int error; - if (is_remote(r)) + if (is_remote(r)) { /* receive_unlock() calls do_unlock() on remote node */ error = send_unlock(r, lkb); - else + } else { error = do_unlock(r, lkb); + /* for remote locks the unlock_reply is sent + between do_unlock and do_unlock_effects */ + do_unlock_effects(r, lkb, error); + } return error; } @@ -2447,11 +2492,15 @@ static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) { int error; - if (is_remote(r)) + if (is_remote(r)) { /* receive_cancel() calls do_cancel() on remote node */ error = send_cancel(r, lkb); - else + } else { error = do_cancel(r, lkb); + /* for remote locks the cancel_reply is sent + between do_cancel and do_cancel_effects */ + do_cancel_effects(r, lkb, error); + } return error; } @@ -3191,6 +3240,7 @@ static void receive_request(struct dlm_ls *ls, struct dlm_message *ms) attach_lkb(r, lkb); error = do_request(r, lkb); send_request_reply(r, lkb, error); + do_request_effects(r, lkb, error); unlock_rsb(r); put_rsb(r); @@ -3226,15 +3276,19 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms) goto out; receive_flags(lkb, ms); + error = receive_convert_args(ls, lkb, ms); - if (error) - goto out_reply; + if (error) { + send_convert_reply(r, lkb, error); + goto out; + } + reply = !down_conversion(lkb); error = do_convert(r, lkb); - out_reply: if (reply) send_convert_reply(r, lkb, error); + do_convert_effects(r, lkb, error); out: unlock_rsb(r); put_rsb(r); @@ -3266,13 +3320,16 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) goto out; receive_flags(lkb, ms); + error = receive_unlock_args(ls, lkb, ms); - if (error) - goto out_reply; + if (error) { + send_unlock_reply(r, lkb, error); + goto out; + } error = do_unlock(r, lkb); - out_reply: send_unlock_reply(r, lkb, error); + do_unlock_effects(r, lkb, error); out: unlock_rsb(r); put_rsb(r); @@ -3307,6 +3364,7 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) error = do_cancel(r, lkb); send_cancel_reply(r, lkb, error); + do_cancel_effects(r, lkb, error); out: unlock_rsb(r); put_rsb(r); -- cgit v1.2.3 From b4a5d4bc377e49239374f266f0a0e2772c29749c Mon Sep 17 00:00:00 2001 From: Steven Whitehouse <swhiteho@redhat.com> Date: Wed, 17 Feb 2010 09:41:34 +0000 Subject: dlm: Send lockspace name with uevents Although it is possible to get this information from the path, its much easier to provide the lockspace as a seperate env variable. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com> Signed-off-by: David Teigland <teigland@redhat.com> --- fs/dlm/lockspace.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index c010ecfc0d29..26a8bd40400a 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -191,6 +191,18 @@ static int do_uevent(struct dlm_ls *ls, int in) return error; } +static int dlm_uevent(struct kset *kset, struct kobject *kobj, + struct kobj_uevent_env *env) +{ + struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); + + add_uevent_var(env, "LOCKSPACE=%s", ls->ls_name); + return 0; +} + +static struct kset_uevent_ops dlm_uevent_ops = { + .uevent = dlm_uevent, +}; int __init dlm_lockspace_init(void) { @@ -199,7 +211,7 @@ int __init dlm_lockspace_init(void) INIT_LIST_HEAD(&lslist); spin_lock_init(&lslist_lock); - dlm_kset = kset_create_and_add("dlm", NULL, kernel_kobj); + dlm_kset = kset_create_and_add("dlm", &dlm_uevent_ops, kernel_kobj); if (!dlm_kset) { printk(KERN_WARNING "%s: can not create kset\n", __func__); return -ENOMEM; -- cgit v1.2.3 From b6fa8796b2da0390e9f4115e8789a01004fc1c9b Mon Sep 17 00:00:00 2001 From: David Teigland <teigland@redhat.com> Date: Thu, 25 Feb 2010 12:20:57 -0600 Subject: dlm: use bastmode in debugfs output The bast mode that appears in the debugfs output should be useful on both master and process nodes. lkb_highbast is currently printed, and is only useful on the master node. lkb_bastmode is only useful on the process node. This patch sets lkb_bastmode on the master node as well, and uses that value in the debugfs print. Signed-off-by: David Teigland <teigland@redhat.com> --- fs/dlm/debug_fs.c | 2 +- fs/dlm/lock.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 375a2359b3bf..29d6139c35fc 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -256,7 +256,7 @@ static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb, lkb->lkb_status, lkb->lkb_grmode, lkb->lkb_rqmode, - lkb->lkb_highbast, + lkb->lkb_bastmode, rsb_lookup, lkb->lkb_wait_type, lkb->lkb_lvbseq, diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index d0e43a3da887..46ffd3eeaaf7 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -320,10 +320,12 @@ static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) { lkb->lkb_time_bast = ktime_get(); - if (is_master_copy(lkb)) + if (is_master_copy(lkb)) { + lkb->lkb_bastmode = rqmode; /* printed by debugfs */ send_bast(r, lkb, rqmode); - else + } else { dlm_add_ast(lkb, AST_BAST, rqmode); + } } /* -- cgit v1.2.3 From b89c54282db0c8634a2d2dc200f196d571750ce5 Mon Sep 17 00:00:00 2001 From: Tiger Yang <tiger.yang@oracle.com> Date: Mon, 25 Jan 2010 14:11:06 +0800 Subject: ocfs2: add extent block stealing for ocfs2 v5 This patch add extent block (metadata) stealing mechanism for extent allocation. This mechanism is same as the inode stealing. if no room in slot specific extent_alloc, we will try to allocate extent block from the next slot. Signed-off-by: Tiger Yang <tiger.yang@oracle.com> Acked-by: Tao Ma <tao.ma@oracle.com> Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/alloc.c | 5 +- fs/ocfs2/dir.c | 2 +- fs/ocfs2/localalloc.c | 2 +- fs/ocfs2/ocfs2.h | 29 +------- fs/ocfs2/refcounttree.c | 6 +- fs/ocfs2/suballoc.c | 171 +++++++++++++++++++++++++++++++++++++----------- fs/ocfs2/suballoc.h | 1 + fs/ocfs2/super.c | 10 ++- fs/ocfs2/xattr.c | 2 +- 9 files changed, 150 insertions(+), 78 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index d17bdc718f74..2bbe1ecc08c0 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1050,7 +1050,8 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle, strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); eb->h_blkno = cpu_to_le64(first_blkno); eb->h_fs_generation = cpu_to_le32(osb->fs_generation); - eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); + eb->h_suballoc_slot = + cpu_to_le16(meta_ac->ac_alloc_slot); eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); eb->h_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); @@ -6037,7 +6038,7 @@ static void ocfs2_truncate_log_worker(struct work_struct *work) if (status < 0) mlog_errno(status); else - ocfs2_init_inode_steal_slot(osb); + ocfs2_init_steal_slots(osb); mlog_exit(status); } diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 28c3ec238796..765d66c70989 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2439,7 +2439,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; memset(dx_root, 0, osb->sb->s_blocksize); strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); - dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num); + dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); dx_root->dr_blkno = cpu_to_le64(dr_blkno); diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index ac10f83edb95..ca992d91f511 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -476,7 +476,7 @@ out_mutex: out: if (!status) - ocfs2_init_inode_steal_slot(osb); + ocfs2_init_steal_slots(osb); mlog_exit(status); return status; } diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 740f448041e2..8857dd724f90 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -305,7 +305,9 @@ struct ocfs2_super u32 s_next_generation; unsigned long osb_flags; s16 s_inode_steal_slot; + s16 s_meta_steal_slot; atomic_t s_num_inodes_stolen; + atomic_t s_num_meta_stolen; unsigned long s_mount_opt; unsigned int s_atime_quantum; @@ -760,33 +762,6 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb, return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); } -static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) -{ - spin_lock(&osb->osb_lock); - osb->s_inode_steal_slot = OCFS2_INVALID_SLOT; - spin_unlock(&osb->osb_lock); - atomic_set(&osb->s_num_inodes_stolen, 0); -} - -static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb, - s16 slot) -{ - spin_lock(&osb->osb_lock); - osb->s_inode_steal_slot = slot; - spin_unlock(&osb->osb_lock); -} - -static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb) -{ - s16 slot; - - spin_lock(&osb->osb_lock); - slot = osb->s_inode_steal_slot; - spin_unlock(&osb->osb_lock); - - return slot; -} - #define ocfs2_set_bit ext2_set_bit #define ocfs2_clear_bit ext2_clear_bit #define ocfs2_test_bit ext2_test_bit diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 8ae65c9c020c..fb6aa7acf54b 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -626,7 +626,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, rb = (struct ocfs2_refcount_block *)new_bh->b_data; memset(rb, 0, inode->i_sb->s_blocksize); strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); - rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num); + rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); rb->rf_blkno = cpu_to_le64(first_blkno); @@ -1330,7 +1330,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle, memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize); new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; - new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); + new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); new_rb->rf_blkno = cpu_to_le64(blkno); new_rb->rf_cpos = cpu_to_le32(0); @@ -1576,7 +1576,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle, new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; memset(new_rb, 0, sb->s_blocksize); strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); - new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); + new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); new_rb->rf_blkno = cpu_to_le64(blkno); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index c30b644d9572..c3c60bc3e072 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -51,7 +51,7 @@ #define ALLOC_NEW_GROUP 0x1 #define ALLOC_GROUPS_FROM_GLOBAL 0x2 -#define OCFS2_MAX_INODES_TO_STEAL 1024 +#define OCFS2_MAX_TO_STEAL 1024 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); @@ -637,12 +637,113 @@ bail: return status; } +static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) +{ + spin_lock(&osb->osb_lock); + osb->s_inode_steal_slot = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + atomic_set(&osb->s_num_inodes_stolen, 0); +} + +static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb) +{ + spin_lock(&osb->osb_lock); + osb->s_meta_steal_slot = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + atomic_set(&osb->s_num_meta_stolen, 0); +} + +void ocfs2_init_steal_slots(struct ocfs2_super *osb) +{ + ocfs2_init_inode_steal_slot(osb); + ocfs2_init_meta_steal_slot(osb); +} + +static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type) +{ + spin_lock(&osb->osb_lock); + if (type == INODE_ALLOC_SYSTEM_INODE) + osb->s_inode_steal_slot = slot; + else if (type == EXTENT_ALLOC_SYSTEM_INODE) + osb->s_meta_steal_slot = slot; + spin_unlock(&osb->osb_lock); +} + +static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type) +{ + int slot = OCFS2_INVALID_SLOT; + + spin_lock(&osb->osb_lock); + if (type == INODE_ALLOC_SYSTEM_INODE) + slot = osb->s_inode_steal_slot; + else if (type == EXTENT_ALLOC_SYSTEM_INODE) + slot = osb->s_meta_steal_slot; + spin_unlock(&osb->osb_lock); + + return slot; +} + +static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb) +{ + return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE); +} + +static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb) +{ + return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE); +} + +static int ocfs2_steal_resource(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac, + int type) +{ + int i, status = -ENOSPC; + int slot = __ocfs2_get_steal_slot(osb, type); + + /* Start to steal resource from the first slot after ours. */ + if (slot == OCFS2_INVALID_SLOT) + slot = osb->slot_num + 1; + + for (i = 0; i < osb->max_slots; i++, slot++) { + if (slot == osb->max_slots) + slot = 0; + + if (slot == osb->slot_num) + continue; + + status = ocfs2_reserve_suballoc_bits(osb, ac, + type, + (u32)slot, NULL, + NOT_ALLOC_NEW_GROUP); + if (status >= 0) { + __ocfs2_set_steal_slot(osb, slot, type); + break; + } + + ocfs2_free_ac_resource(ac); + } + + return status; +} + +static int ocfs2_steal_inode(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE); +} + +static int ocfs2_steal_meta(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE); +} + int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, int blocks, struct ocfs2_alloc_context **ac) { int status; - u32 slot; + int slot = ocfs2_get_meta_steal_slot(osb); *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); if (!(*ac)) { @@ -653,12 +754,34 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, (*ac)->ac_bits_wanted = blocks; (*ac)->ac_which = OCFS2_AC_USE_META; - slot = osb->slot_num; (*ac)->ac_group_search = ocfs2_block_group_search; + if (slot != OCFS2_INVALID_SLOT && + atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL) + goto extent_steal; + + atomic_set(&osb->s_num_meta_stolen, 0); status = ocfs2_reserve_suballoc_bits(osb, (*ac), EXTENT_ALLOC_SYSTEM_INODE, - slot, NULL, ALLOC_NEW_GROUP); + (u32)osb->slot_num, NULL, + ALLOC_NEW_GROUP); + + + if (status >= 0) { + status = 0; + if (slot != OCFS2_INVALID_SLOT) + ocfs2_init_meta_steal_slot(osb); + goto bail; + } else if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + + ocfs2_free_ac_resource(*ac); + +extent_steal: + status = ocfs2_steal_meta(osb, *ac); + atomic_inc(&osb->s_num_meta_stolen); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -685,43 +808,11 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, ac); } -static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb, - struct ocfs2_alloc_context *ac) -{ - int i, status = -ENOSPC; - s16 slot = ocfs2_get_inode_steal_slot(osb); - - /* Start to steal inodes from the first slot after ours. */ - if (slot == OCFS2_INVALID_SLOT) - slot = osb->slot_num + 1; - - for (i = 0; i < osb->max_slots; i++, slot++) { - if (slot == osb->max_slots) - slot = 0; - - if (slot == osb->slot_num) - continue; - - status = ocfs2_reserve_suballoc_bits(osb, ac, - INODE_ALLOC_SYSTEM_INODE, - slot, NULL, - NOT_ALLOC_NEW_GROUP); - if (status >= 0) { - ocfs2_set_inode_steal_slot(osb, slot); - break; - } - - ocfs2_free_ac_resource(ac); - } - - return status; -} - int ocfs2_reserve_new_inode(struct ocfs2_super *osb, struct ocfs2_alloc_context **ac) { int status; - s16 slot = ocfs2_get_inode_steal_slot(osb); + int slot = ocfs2_get_inode_steal_slot(osb); u64 alloc_group; *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); @@ -754,14 +845,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb, * need to check our slots to see whether there is some space for us. */ if (slot != OCFS2_INVALID_SLOT && - atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL) + atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL) goto inode_steal; atomic_set(&osb->s_num_inodes_stolen, 0); alloc_group = osb->osb_inode_alloc_group; status = ocfs2_reserve_suballoc_bits(osb, *ac, INODE_ALLOC_SYSTEM_INODE, - osb->slot_num, + (u32)osb->slot_num, &alloc_group, ALLOC_NEW_GROUP | ALLOC_GROUPS_FROM_GLOBAL); @@ -789,7 +880,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb, ocfs2_free_ac_resource(*ac); inode_steal: - status = ocfs2_steal_inode_from_other_nodes(osb, *ac); + status = ocfs2_steal_inode(osb, *ac); atomic_inc(&osb->s_num_inodes_stolen); if (status < 0) { if (status != -ENOSPC) diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 8c9a78a43164..fa60723c43e8 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -56,6 +56,7 @@ struct ocfs2_alloc_context { is the same as ~0 - unlimited */ }; +void ocfs2_init_steal_slots(struct ocfs2_super *osb); void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) { diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 755cd49a5ef3..dee03197a494 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -69,6 +69,7 @@ #include "xattr.h" #include "quota.h" #include "refcounttree.h" +#include "suballoc.h" #include "buffer_head_io.h" @@ -301,9 +302,12 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) spin_lock(&osb->osb_lock); out += snprintf(buf + out, len - out, - "%10s => Slot: %d NumStolen: %d\n", "Steal", + "%10s => InodeSlot: %d StolenInodes: %d, " + "MetaSlot: %d StolenMeta: %d\n", "Steal", osb->s_inode_steal_slot, - atomic_read(&osb->s_num_inodes_stolen)); + atomic_read(&osb->s_num_inodes_stolen), + osb->s_meta_steal_slot, + atomic_read(&osb->s_num_meta_stolen)); spin_unlock(&osb->osb_lock); out += snprintf(buf + out, len - out, "OrphanScan => "); @@ -1997,7 +2001,7 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->blocked_lock_count = 0; spin_lock_init(&osb->osb_lock); spin_lock_init(&osb->osb_xattr_lock); - ocfs2_init_inode_steal_slot(osb); + ocfs2_init_steal_slots(osb); atomic_set(&osb->alloc_stats.moves, 0); atomic_set(&osb->alloc_stats.local_data, 0); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 8fc6fb071c6d..8ae4e5d1f730 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2282,7 +2282,7 @@ static int ocfs2_create_xattr_block(handle_t *handle, xblk = (struct ocfs2_xattr_block *)new_bh->b_data; memset(xblk, 0, inode->i_sb->s_blocksize); strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); - xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num); + xblk->xb_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); xblk->xb_blkno = cpu_to_le64(first_blkno); -- cgit v1.2.3 From 96a1cc731adb28dc4feb71701091b80e67d486a7 Mon Sep 17 00:00:00 2001 From: Wengang Wang <wen.gang.wang@oracle.com> Date: Tue, 9 Feb 2010 14:57:45 +0800 Subject: ocfs2: Clean up the checks for CoW and direct I/O. When ocfs2 has to do CoW for refcounted extents, we disable direct I/O and go through the buffered I/O path. This makes the combined check easier to read. Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com> Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/file.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 558ce0312421..da097bd07b72 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1836,6 +1836,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, &meta_level); if (has_refcount) *has_refcount = 1; + if (direct_io) + *direct_io = 0; } if (ret < 0) { @@ -1859,10 +1861,6 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, break; } - if (has_refcount && *has_refcount == 1) { - *direct_io = 0; - break; - } /* * Allowing concurrent direct writes means * i_size changes wouldn't be synchronized, so -- cgit v1.2.3 From 8545e03d82b6739461bbd60db7aba144f7dbe80f Mon Sep 17 00:00:00 2001 From: Sunil Mushran <sunil.mushran@oracle.com> Date: Fri, 12 Feb 2010 14:09:06 -0800 Subject: ocfs2: Add current->comm in trace output Add current->comm to the standard mlog() output to help with debugging. Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com> Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/cluster/masklog.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 9b4d11726cf2..0442366b3060 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -194,9 +194,9 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; * previous token if args expands to nothing. */ #define __mlog_printk(level, fmt, args...) \ - printk(level "(%u,%lu):%s:%d " fmt, task_pid_nr(current), \ - __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ , \ - ##args) + printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \ + task_pid_nr(current), __mlog_cpu_guess, \ + __PRETTY_FUNCTION__, __LINE__ , ##args) #define mlog(mask, fmt, args...) do { \ u64 __m = MLOG_MASK_PREFIX | (mask); \ -- cgit v1.2.3 From 11179f2c92cb025b1ff0b794f9714b3fb395855f Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Fri, 14 Aug 2009 16:07:44 -0700 Subject: ocfs2: Introduce ocfs2_xa_loc The ocfs2 extended attribute (xattr) code is very flexible. It can store xattrs in the inode itself, in an external block, or in a tree of data structures. This allows the number of xattrs to be bounded by the filesystem size. However, the code that manages each possible storage location is different. Maintaining the ocfs2 xattr code requires changing each hunk separately. This patch is the start of a series introducing the ocfs2_xa_loc structure. This structure wraps the on-disk details of an xattr entry. The goal is that the generic xattr routines can use ocfs2_xa_loc without knowing the underlying storage location. This first pass merely implements the basic structure, initializing it, and wiping the name+value pair of the entry. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/xattr.c | 241 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 226 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 8ae4e5d1f730..945ca697eb25 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -137,6 +137,51 @@ struct ocfs2_xattr_search { int not_found; }; +/* Operations on struct ocfs2_xa_entry */ +struct ocfs2_xa_loc; +struct ocfs2_xa_loc_operations { + /* + * Return a pointer to the appropriate buffer in loc->xl_storage + * at the given offset from loc->xl_header. + */ + void *(*xlo_offset_pointer)(struct ocfs2_xa_loc *loc, int offset); + + /* + * Remove the name+value at this location. Do whatever is + * appropriate with the remaining name+value pairs. + */ + void (*xlo_wipe_namevalue)(struct ocfs2_xa_loc *loc); +}; + +/* + * Describes an xattr entry location. This is a memory structure + * tracking the on-disk structure. + */ +struct ocfs2_xa_loc { + /* The ocfs2_xattr_header inside the on-disk storage. Not NULL. */ + struct ocfs2_xattr_header *xl_header; + + /* Bytes from xl_header to the end of the storage */ + int xl_size; + + /* + * The ocfs2_xattr_entry this location describes. If this is + * NULL, this location describes the on-disk structure where it + * would have been. + */ + struct ocfs2_xattr_entry *xl_entry; + + /* + * Internal housekeeping + */ + + /* Buffer(s) containing this entry */ + void *xl_storage; + + /* Operations on the storage backing this location */ + const struct ocfs2_xa_loc_operations *xl_ops; +}; + static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, struct ocfs2_xattr_header *xh, int index, @@ -1417,6 +1462,170 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode, return ret; } +/* + * Wipe the name+value pair and allow the storage to reclaim it. This + * must be followed by either removal of the entry or a call to + * ocfs2_xa_add_namevalue(). + */ +static void ocfs2_xa_wipe_namevalue(struct ocfs2_xa_loc *loc) +{ + loc->xl_ops->xlo_wipe_namevalue(loc); +} + +static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc, + int offset) +{ + BUG_ON(offset >= loc->xl_size); + return (char *)loc->xl_header + offset; +} + +/* + * Block storage for xattrs keeps the name+value pairs compacted. When + * we remove one, we have to shift any that preceded it towards the end. + */ +static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc) +{ + int i, offset; + int namevalue_offset, first_namevalue_offset, namevalue_size; + struct ocfs2_xattr_entry *entry = loc->xl_entry; + struct ocfs2_xattr_header *xh = loc->xl_header; + u64 value_size = le64_to_cpu(entry->xe_value_size); + int count = le16_to_cpu(xh->xh_count); + + namevalue_offset = le16_to_cpu(entry->xe_name_offset); + namevalue_size = OCFS2_XATTR_SIZE(entry->xe_name_len); + if (value_size > OCFS2_XATTR_INLINE_SIZE) + namevalue_size += OCFS2_XATTR_ROOT_SIZE; + else + namevalue_size += OCFS2_XATTR_SIZE(value_size); + + for (i = 0, first_namevalue_offset = loc->xl_size; + i < count; i++) { + offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset); + if (offset < first_namevalue_offset) + first_namevalue_offset = offset; + } + + /* Shift the name+value pairs */ + memmove((char *)xh + first_namevalue_offset + namevalue_size, + (char *)xh + first_namevalue_offset, + namevalue_offset - first_namevalue_offset); + memset((char *)xh + first_namevalue_offset, 0, namevalue_size); + + /* Now tell xh->xh_entries about it */ + for (i = 0; i < count; i++) { + offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset); + if (offset < namevalue_offset) + le16_add_cpu(&xh->xh_entries[i].xe_name_offset, + namevalue_size); + } + + /* + * Note that we don't update xh_free_start or xh_name_value_len + * because they're not used in block-stored xattrs. + */ +} + +/* + * Operations for xattrs stored in blocks. This includes inline inode + * storage and unindexed ocfs2_xattr_blocks. + */ +static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = { + .xlo_offset_pointer = ocfs2_xa_block_offset_pointer, + .xlo_wipe_namevalue = ocfs2_xa_block_wipe_namevalue, +}; + +static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc, + int offset) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + int block, block_offset; + + BUG_ON(offset >= OCFS2_XATTR_BUCKET_SIZE); + + /* The header is at the front of the bucket */ + block = offset >> bucket->bu_inode->i_sb->s_blocksize_bits; + block_offset = offset % bucket->bu_inode->i_sb->s_blocksize; + + return bucket_block(bucket, block) + block_offset; +} + +static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc) +{ + int namevalue_size; + struct ocfs2_xattr_entry *entry = loc->xl_entry; + u64 value_size = le64_to_cpu(entry->xe_value_size); + + namevalue_size = OCFS2_XATTR_SIZE(entry->xe_name_len); + if (value_size > OCFS2_XATTR_INLINE_SIZE) + namevalue_size += OCFS2_XATTR_ROOT_SIZE; + else + namevalue_size += OCFS2_XATTR_SIZE(value_size); + + le16_add_cpu(&loc->xl_header->xh_name_value_len, -namevalue_size); +} + +/* Operations for xattrs stored in buckets. */ +static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = { + .xlo_offset_pointer = ocfs2_xa_bucket_offset_pointer, + .xlo_wipe_namevalue = ocfs2_xa_bucket_wipe_namevalue, +}; + +static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc) +{ + ocfs2_xa_wipe_namevalue(loc); +} + +static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc, + struct inode *inode, + struct buffer_head *bh, + struct ocfs2_xattr_entry *entry) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + loc->xl_ops = &ocfs2_xa_block_loc_ops; + loc->xl_storage = bh; + loc->xl_entry = entry; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL) + loc->xl_size = le16_to_cpu(di->i_xattr_inline_size); + else { + BUG_ON(entry); + loc->xl_size = OCFS2_SB(inode->i_sb)->s_xattr_inline_size; + } + loc->xl_header = + (struct ocfs2_xattr_header *)(bh->b_data + bh->b_size - + loc->xl_size); +} + +static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc, + struct buffer_head *bh, + struct ocfs2_xattr_entry *entry) +{ + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)bh->b_data; + + BUG_ON(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED); + + loc->xl_ops = &ocfs2_xa_block_loc_ops; + loc->xl_storage = bh; + loc->xl_header = &(xb->xb_attrs.xb_header); + loc->xl_entry = entry; + loc->xl_size = bh->b_size - offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_header); +} + +static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_bucket *bucket, + struct ocfs2_xattr_entry *entry) +{ + loc->xl_ops = &ocfs2_xa_bucket_loc_ops; + loc->xl_storage = bucket; + loc->xl_header = bucket_xh(bucket); + loc->xl_entry = entry; + loc->xl_size = OCFS2_XATTR_BUCKET_SIZE; +} + /* * ocfs2_xattr_set_entry_local() * @@ -1430,7 +1639,14 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode, { size_t name_len = strlen(xi->name); int i; + struct ocfs2_xa_loc loc; + if (xs->xattr_bh == xs->inode_bh) + ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh, + xs->not_found ? NULL : xs->here); + else + ocfs2_init_xattr_block_xa_loc(&loc, xs->xattr_bh, + xs->not_found ? NULL : xs->here); if (xi->value && xs->not_found) { /* Insert the new xattr entry. */ le16_add_cpu(&xs->header->xh_count, 1); @@ -1469,9 +1685,9 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode, xi->value_len); return; } + /* Remove the old name+value. */ - memmove(first_val + size, first_val, val - first_val); - memset(first_val, 0, size); + ocfs2_xa_wipe_namevalue(&loc); xs->here->xe_name_hash = 0; xs->here->xe_name_offset = 0; ocfs2_xattr_set_local(xs->here, 1); @@ -1479,23 +1695,15 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode, min_offs += size; - /* Adjust all value offsets. */ - last = xs->header->xh_entries; - for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) { - size_t o = le16_to_cpu(last->xe_name_offset); - - if (o < offs) - last->xe_name_offset = cpu_to_le16(o + size); - last += 1; - } - if (!xi->value) { /* Remove the old entry. */ - last -= 1; + i = le16_to_cpu(xs->header->xh_count) - 1; + last = &xs->header->xh_entries[i]; + xs->header->xh_count = cpu_to_le16(i); + memmove(xs->here, xs->here + 1, (void *)last - (void *)xs->here); memset(last, 0, sizeof(struct ocfs2_xattr_entry)); - le16_add_cpu(&xs->header->xh_count, -1); } } if (xi->value) { @@ -4769,7 +4977,10 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, size_t blocksize = inode->i_sb->s_blocksize; char *val; size_t offs, size, new_size; + struct ocfs2_xa_loc loc; + ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket, + xs->not_found ? NULL : xs->here); last = &xh->xh_entries[count]; if (!xs->not_found) { xe = xs->here; @@ -4790,7 +5001,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, new_size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len); - le16_add_cpu(&xh->xh_name_value_len, -size); + ocfs2_xa_wipe_namevalue(&loc); if (xi->value) { if (new_size > size) goto set_new_name_value; -- cgit v1.2.3 From bde1e5400a1b21ef47932ab00446c7361ff3c139 Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Fri, 14 Aug 2009 16:58:38 -0700 Subject: ocfs2: Remove xattrs via ocfs2_xa_loc Add ocfs2_xa_remove_entry(), which will remove an xattr entry from its storage via the ocfs2_xa_loc descriptor. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/xattr.c | 62 ++++++++++++++++++++++++++------------------------------ 1 file changed, 29 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 945ca697eb25..22a60a7c35ca 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1573,7 +1573,29 @@ static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = { static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc) { + int index, count; + struct ocfs2_xattr_header *xh = loc->xl_header; + struct ocfs2_xattr_entry *entry = loc->xl_entry; + ocfs2_xa_wipe_namevalue(loc); + loc->xl_entry = NULL; + + le16_add_cpu(&xh->xh_count, -1); + count = le16_to_cpu(xh->xh_count); + + /* + * Only zero out the entry if there are more remaining. This is + * important for an empty bucket, as it keeps track of the + * bucket's hash value. It doesn't hurt empty block storage. + */ + if (count) { + index = ((char *)entry - (char *)&xh->xh_entries) / + sizeof(struct ocfs2_xattr_entry); + memmove(&xh->xh_entries[index], &xh->xh_entries[index + 1], + (count - index) * sizeof(struct ocfs2_xattr_entry)); + memset(&xh->xh_entries[count], 0, + sizeof(struct ocfs2_xattr_entry)); + } } static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc, @@ -1638,7 +1660,6 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode, size_t min_offs) { size_t name_len = strlen(xi->name); - int i; struct ocfs2_xa_loc loc; if (xs->xattr_bh == xs->inode_bh) @@ -1686,25 +1707,12 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode, return; } - /* Remove the old name+value. */ - ocfs2_xa_wipe_namevalue(&loc); - xs->here->xe_name_hash = 0; - xs->here->xe_name_offset = 0; - ocfs2_xattr_set_local(xs->here, 1); - xs->here->xe_value_size = 0; + if (!xi->value) + ocfs2_xa_remove_entry(&loc); + else + ocfs2_xa_wipe_namevalue(&loc); min_offs += size; - - if (!xi->value) { - /* Remove the old entry. */ - i = le16_to_cpu(xs->header->xh_count) - 1; - last = &xs->header->xh_entries[i]; - xs->header->xh_count = cpu_to_le16(i); - - memmove(xs->here, xs->here + 1, - (void *)last - (void *)xs->here); - memset(last, 0, sizeof(struct ocfs2_xattr_entry)); - } } if (xi->value) { /* Insert the new name+value. */ @@ -5001,8 +5009,8 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, new_size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len); - ocfs2_xa_wipe_namevalue(&loc); if (xi->value) { + ocfs2_xa_wipe_namevalue(&loc); if (new_size > size) goto set_new_name_value; @@ -5024,20 +5032,8 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, ocfs2_xattr_set_local(xe, local); return; } else { - /* - * Remove the old entry if there is more than one. - * We don't remove the last entry so that we can - * use it to indicate the hash value of the empty - * bucket. - */ - last -= 1; - le16_add_cpu(&xh->xh_count, -1); - if (xh->xh_count) { - memmove(xe, xe + 1, - (void *)last - (void *)xe); - memset(last, 0, - sizeof(struct ocfs2_xattr_entry)); - } else + ocfs2_xa_remove_entry(&loc); + if (!xh->xh_count) xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); -- cgit v1.2.3 From 6b240ff63c9dda93366c61c969b81ca22fe676ac Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Fri, 14 Aug 2009 18:02:52 -0700 Subject: ocfs2: Prefix the member fields of struct ocfs2_xattr_info. struct ocfs2_xattr_info is a useful structure describing an xattr you'd like to set. Let's put prefixes on the member fields so it's easier to read and use. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/xattr.c | 212 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 108 insertions(+), 104 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 22a60a7c35ca..c675a6cda0bb 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -116,10 +116,10 @@ static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { }; struct ocfs2_xattr_info { - int name_index; - const char *name; - const void *value; - size_t value_len; + int xi_name_index; + const char *xi_name; + const void *xi_value; + size_t xi_value_len; }; struct ocfs2_xattr_search { @@ -1361,7 +1361,7 @@ static int ocfs2_xattr_cleanup(struct inode *inode, size_t offs) { int ret = 0; - size_t name_len = strlen(xi->name); + size_t name_len = strlen(xi->xi_name); void *val = xs->base + offs; size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; @@ -1401,8 +1401,8 @@ static int ocfs2_xattr_update_entry(struct inode *inode, } xs->here->xe_name_offset = cpu_to_le16(offs); - xs->here->xe_value_size = cpu_to_le64(xi->value_len); - if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE) + xs->here->xe_value_size = cpu_to_le64(xi->xi_value_len); + if (xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) ocfs2_xattr_set_local(xs->here, 1); else ocfs2_xattr_set_local(xs->here, 0); @@ -1427,14 +1427,14 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode, struct ocfs2_xattr_value_buf *vb, size_t offs) { - size_t name_len = strlen(xi->name); + size_t name_len = strlen(xi->xi_name); void *val = xs->base + offs; struct ocfs2_xattr_value_root *xv = NULL; size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; int ret = 0; memset(val, 0, size); - memcpy(val, xi->name, name_len); + memcpy(val, xi->xi_name, name_len); xv = (struct ocfs2_xattr_value_root *) (val + OCFS2_XATTR_SIZE(name_len)); xv->xr_clusters = 0; @@ -1444,7 +1444,7 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode, xv->xr_list.l_next_free_rec = 0; vb->vb_xv = xv; - ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt); + ret = ocfs2_xattr_value_truncate(inode, vb, xi->xi_value_len, ctxt); if (ret < 0) { mlog_errno(ret); return ret; @@ -1455,7 +1455,7 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode, return ret; } ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb, - xi->value, xi->value_len); + xi->xi_value, xi->xi_value_len); if (ret < 0) mlog_errno(ret); @@ -1659,7 +1659,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode, struct ocfs2_xattr_entry *last, size_t min_offs) { - size_t name_len = strlen(xi->name); + size_t name_len = strlen(xi->xi_name); struct ocfs2_xa_loc loc; if (xs->xattr_bh == xs->inode_bh) @@ -1668,10 +1668,10 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode, else ocfs2_init_xattr_block_xa_loc(&loc, xs->xattr_bh, xs->not_found ? NULL : xs->here); - if (xi->value && xs->not_found) { + if (xi->xi_value && xs->not_found) { /* Insert the new xattr entry. */ le16_add_cpu(&xs->header->xh_count, 1); - ocfs2_xattr_set_type(last, xi->name_index); + ocfs2_xattr_set_type(last, xi->xi_name_index); ocfs2_xattr_set_local(last, 1); last->xe_name_len = name_len; } else { @@ -1691,42 +1691,42 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode, size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); - if (xi->value && size == OCFS2_XATTR_SIZE(name_len) + - OCFS2_XATTR_SIZE(xi->value_len)) { + if (xi->xi_value && size == OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->xi_value_len)) { /* The old and the new value have the same size. Just replace the value. */ ocfs2_xattr_set_local(xs->here, 1); - xs->here->xe_value_size = cpu_to_le64(xi->value_len); + xs->here->xe_value_size = cpu_to_le64(xi->xi_value_len); /* Clear value bytes. */ memset(val + OCFS2_XATTR_SIZE(name_len), 0, - OCFS2_XATTR_SIZE(xi->value_len)); + OCFS2_XATTR_SIZE(xi->xi_value_len)); memcpy(val + OCFS2_XATTR_SIZE(name_len), - xi->value, - xi->value_len); + xi->xi_value, + xi->xi_value_len); return; } - if (!xi->value) + if (!xi->xi_value) ocfs2_xa_remove_entry(&loc); else ocfs2_xa_wipe_namevalue(&loc); min_offs += size; } - if (xi->value) { + if (xi->xi_value) { /* Insert the new name+value. */ size_t size = OCFS2_XATTR_SIZE(name_len) + - OCFS2_XATTR_SIZE(xi->value_len); + OCFS2_XATTR_SIZE(xi->xi_value_len); void *val = xs->base + min_offs - size; xs->here->xe_name_offset = cpu_to_le16(min_offs - size); memset(val, 0, size); - memcpy(val, xi->name, name_len); + memcpy(val, xi->xi_name, name_len); memcpy(val + OCFS2_XATTR_SIZE(name_len), - xi->value, - xi->value_len); - xs->here->xe_value_size = cpu_to_le64(xi->value_len); + xi->xi_value, + xi->xi_value_len); + xs->here->xe_value_size = cpu_to_le64(xi->xi_value_len); ocfs2_xattr_set_local(xs->here, 1); ocfs2_xattr_hash_entry(inode, xs->header, xs->here); } @@ -1752,15 +1752,15 @@ static int ocfs2_xattr_set_entry(struct inode *inode, struct ocfs2_xattr_entry *last; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; - size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); + size_t min_offs = xs->end - xs->base, name_len = strlen(xi->xi_name); size_t size_l = 0; handle_t *handle = ctxt->handle; int free, i, ret; struct ocfs2_xattr_info xi_l = { - .name_index = xi->name_index, - .name = xi->name, - .value = xi->value, - .value_len = xi->value_len, + .xi_name_index = xi->xi_name_index, + .xi_name = xi->xi_name, + .xi_value = xi->xi_value, + .xi_value_len = xi->xi_value_len, }; struct ocfs2_xattr_value_buf vb = { .vb_bh = xs->xattr_bh, @@ -1798,7 +1798,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, free += (size + sizeof(struct ocfs2_xattr_entry)); } /* Check free space in inode or block */ - if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + if (xi->xi_value && xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { if (free < sizeof(struct ocfs2_xattr_entry) + OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE) { @@ -1806,12 +1806,12 @@ static int ocfs2_xattr_set_entry(struct inode *inode, goto out; } size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; - xi_l.value = (void *)&def_xv; - xi_l.value_len = OCFS2_XATTR_ROOT_SIZE; - } else if (xi->value) { + xi_l.xi_value = (void *)&def_xv; + xi_l.xi_value_len = OCFS2_XATTR_ROOT_SIZE; + } else if (xi->xi_value) { if (free < sizeof(struct ocfs2_xattr_entry) + OCFS2_XATTR_SIZE(name_len) + - OCFS2_XATTR_SIZE(xi->value_len)) { + OCFS2_XATTR_SIZE(xi->xi_value_len)) { ret = -ENOSPC; goto out; } @@ -1836,16 +1836,16 @@ static int ocfs2_xattr_set_entry(struct inode *inode, vb.vb_xv = (struct ocfs2_xattr_value_root *) (val + OCFS2_XATTR_SIZE(name_len)); - if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { /* * If new value need set outside also, * first truncate old value to new value, * then set new value with set_value_outside(). */ ret = ocfs2_xattr_value_truncate(inode, - &vb, - xi->value_len, - ctxt); + &vb, + xi->xi_value_len, + ctxt); if (ret < 0) { mlog_errno(ret); goto out; @@ -1863,10 +1863,10 @@ static int ocfs2_xattr_set_entry(struct inode *inode, } ret = __ocfs2_xattr_set_value_outside(inode, - handle, - &vb, - xi->value, - xi->value_len); + handle, + &vb, + xi->xi_value, + xi->xi_value_len); if (ret < 0) mlog_errno(ret); goto out; @@ -1944,7 +1944,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, if (ret < 0) mlog_errno(ret); - if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + if (!ret && xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { /* * Set value outside in B tree. * This is the second step for value size > INLINE_SIZE. @@ -2610,13 +2610,13 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode, BUG_ON(!xs->not_found); - if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) value_size = OCFS2_XATTR_ROOT_SIZE; else - value_size = OCFS2_XATTR_SIZE(xi->value_len); + value_size = OCFS2_XATTR_SIZE(xi->xi_value_len); if (free >= sizeof(struct ocfs2_xattr_entry) + - OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size) + OCFS2_XATTR_SIZE(strlen(xi->xi_name)) + value_size) return 1; return 0; @@ -2640,7 +2640,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode, char *base = NULL; int name_offset, name_len = 0; u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, - xi->value_len); + xi->xi_value_len); u64 value_size; /* @@ -2648,14 +2648,14 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode, * No matter whether we replace an old one or add a new one, * we need this for writing. */ - if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) credits += new_clusters * ocfs2_clusters_to_blocks(inode->i_sb, 1); if (xis->not_found && xbs->not_found) { credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); - if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { clusters_add += new_clusters; credits += ocfs2_calc_extend_credits(inode->i_sb, &def_xv.xv.xr_list, @@ -2700,7 +2700,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode, * The credits for removing the value tree will be extended * by ocfs2_remove_extent itself. */ - if (!xi->value) { + if (!xi->xi_value) { if (!ocfs2_xattr_is_local(xe)) credits += ocfs2_remove_extent_credits(inode->i_sb); @@ -2730,7 +2730,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode, } } - if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { /* the new values will be stored outside. */ u32 old_clusters = 0; @@ -2763,9 +2763,10 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode, * value, we don't need any allocation, otherwise we have * to guess metadata allocation. */ - if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) || + if ((ocfs2_xattr_is_local(xe) && + (value_size >= xi->xi_value_len)) || (!ocfs2_xattr_is_local(xe) && - OCFS2_XATTR_ROOT_SIZE >= xi->value_len)) + OCFS2_XATTR_ROOT_SIZE >= xi->xi_value_len)) goto out; } @@ -2855,7 +2856,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode, meta_add += extra_meta; mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " - "credits = %d\n", xi->name, meta_add, clusters_add, *credits); + "credits = %d\n", xi->xi_name, meta_add, clusters_add, *credits); if (meta_add) { ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, @@ -2895,7 +2896,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, { int ret = 0, credits, old_found; - if (!xi->value) { + if (!xi->xi_value) { /* Remove existing extended attribute */ if (!xis->not_found) ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); @@ -2909,8 +2910,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, * If succeed and that extended attribute existing in * external block, then we will remove it. */ - xi->value = NULL; - xi->value_len = 0; + xi->xi_value = NULL; + xi->xi_value_len = 0; old_found = xis->not_found; xis->not_found = -ENODATA; @@ -2938,8 +2939,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, } else if (ret == -ENOSPC) { if (di->i_xattr_loc && !xbs->xattr_bh) { ret = ocfs2_xattr_block_find(inode, - xi->name_index, - xi->name, xbs); + xi->xi_name_index, + xi->xi_name, xbs); if (ret) goto out; @@ -2978,8 +2979,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, * If succeed and that extended attribute * existing in inode, we will remove it. */ - xi->value = NULL; - xi->value_len = 0; + xi->xi_value = NULL; + xi->xi_value_len = 0; xbs->not_found = -ENODATA; ret = ocfs2_calc_xattr_set_need(inode, di, @@ -3045,10 +3046,10 @@ int ocfs2_xattr_set_handle(handle_t *handle, int ret; struct ocfs2_xattr_info xi = { - .name_index = name_index, - .name = name, - .value = value, - .value_len = value_len, + .xi_name_index = name_index, + .xi_name = name, + .xi_value = value, + .xi_value_len = value_len, }; struct ocfs2_xattr_search xis = { @@ -3128,10 +3129,10 @@ int ocfs2_xattr_set(struct inode *inode, struct ocfs2_refcount_tree *ref_tree = NULL; struct ocfs2_xattr_info xi = { - .name_index = name_index, - .name = name, - .value = value, - .value_len = value_len, + .xi_name_index = name_index, + .xi_name = name, + .xi_value = value, + .xi_value_len = value_len, }; struct ocfs2_xattr_search xis = { @@ -4979,7 +4980,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, int local) { struct ocfs2_xattr_entry *last, *xe; - int name_len = strlen(xi->name); + int name_len = strlen(xi->xi_name); struct ocfs2_xattr_header *xh = xs->header; u16 count = le16_to_cpu(xh->xh_count), start; size_t blocksize = inode->i_sb->s_blocksize; @@ -5001,22 +5002,24 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE); /* - * If the new value will be stored outside, xi->value has been - * initalized as an empty ocfs2_xattr_value_root, and the same - * goes with xi->value_len, so we can set new_size safely here. + * If the new value will be stored outside, xi->xi_value has + * been initalized as an empty ocfs2_xattr_value_root, and + * the same goes with xi->xi_value_len, so we can set + * new_size safely here. * See ocfs2_xattr_set_in_bucket. */ new_size = OCFS2_XATTR_SIZE(name_len) + - OCFS2_XATTR_SIZE(xi->value_len); + OCFS2_XATTR_SIZE(xi->xi_value_len); - if (xi->value) { + if (xi->xi_value) { ocfs2_xa_wipe_namevalue(&loc); if (new_size > size) goto set_new_name_value; /* Now replace the old value with new one. */ if (local) - xe->xe_value_size = cpu_to_le64(xi->value_len); + xe->xe_value_size = + cpu_to_le64(xi->xi_value_len); else xe->xe_value_size = 0; @@ -5024,9 +5027,9 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, xs->bucket, offs); memset(val + OCFS2_XATTR_SIZE(name_len), 0, size - OCFS2_XATTR_SIZE(name_len)); - if (OCFS2_XATTR_SIZE(xi->value_len) > 0) + if (OCFS2_XATTR_SIZE(xi->xi_value_len) > 0) memcpy(val + OCFS2_XATTR_SIZE(name_len), - xi->value, xi->value_len); + xi->xi_value, xi->xi_value_len); le16_add_cpu(&xh->xh_name_value_len, new_size); ocfs2_xattr_set_local(xe, local); @@ -5067,12 +5070,12 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, memset(xe, 0, sizeof(struct ocfs2_xattr_entry)); xe->xe_name_hash = cpu_to_le32(name_hash); xe->xe_name_len = name_len; - ocfs2_xattr_set_type(xe, xi->name_index); + ocfs2_xattr_set_type(xe, xi->xi_name_index); } set_new_name_value: /* Insert the new name+value. */ - size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len); + size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->xi_value_len); /* * We must make sure that the name/value pair @@ -5091,10 +5094,11 @@ set_new_name_value: xe->xe_name_offset = cpu_to_le16(offs - size); memset(val, 0, size); - memcpy(val, xi->name, name_len); - memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len); + memcpy(val, xi->xi_name, name_len); + memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->xi_value, + xi->xi_value_len); - xe->xe_value_size = cpu_to_le64(xi->value_len); + xe->xe_value_size = cpu_to_le64(xi->xi_value_len); ocfs2_xattr_set_local(xe, local); xs->here = xe; le16_add_cpu(&xh->xh_free_start, -size); @@ -5119,7 +5123,7 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, u64 blkno; mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n", - (unsigned long)xi->value_len, xi->name_index, + (unsigned long)xi->xi_value_len, xi->xi_name_index, (unsigned long long)bucket_blkno(xs->bucket)); if (!xs->bucket->bu_bhs[1]) { @@ -5417,10 +5421,10 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, { int ret, local = 1; size_t value_len; - char *val = (char *)xi->value; + char *val = (char *)xi->xi_value; struct ocfs2_xattr_entry *xe = xs->here; - u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name, - strlen(xi->name)); + u32 name_hash = ocfs2_xattr_name_hash(inode, xi->xi_name, + strlen(xi->xi_name)); if (!xs->not_found && !ocfs2_xattr_is_local(xe)) { /* @@ -5435,8 +5439,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, * the modification to the xattr block will be done * by following steps. */ - if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) - value_len = xi->value_len; + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) + value_len = xi->xi_value_len; else value_len = 0; @@ -5450,7 +5454,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, goto set_value_outside; } - value_len = xi->value_len; + value_len = xi->xi_value_len; /* So we have to handle the inside block change now. */ if (value_len > OCFS2_XATTR_INLINE_SIZE) { /* @@ -5458,8 +5462,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, * initalize a new empty value root and insert it first. */ local = 0; - xi->value = &def_xv; - xi->value_len = OCFS2_XATTR_ROOT_SIZE; + xi->xi_value = &def_xv; + xi->xi_value_len = OCFS2_XATTR_ROOT_SIZE; } ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs, @@ -5533,11 +5537,11 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode, struct ocfs2_xattr_entry *xe; u16 count, header_size, xh_free_start; int free, max_free, need, old; - size_t value_size = 0, name_len = strlen(xi->name); + size_t value_size = 0, name_len = strlen(xi->xi_name); size_t blocksize = inode->i_sb->s_blocksize; int ret, allocation = 0; - mlog_entry("Set xattr %s in xattr index block\n", xi->name); + mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name); try_again: xh = xs->header; @@ -5553,10 +5557,10 @@ try_again: (unsigned long long)bucket_blkno(xs->bucket), header_size); - if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) + if (xi->xi_value && xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) value_size = OCFS2_XATTR_ROOT_SIZE; - else if (xi->value) - value_size = OCFS2_XATTR_SIZE(xi->value_len); + else if (xi->xi_value) + value_size = OCFS2_XATTR_SIZE(xi->xi_value_len); if (xs->not_found) need = sizeof(struct ocfs2_xattr_entry) + @@ -5639,7 +5643,7 @@ try_again: */ ret = ocfs2_check_xattr_bucket_collision(inode, xs->bucket, - xi->name); + xi->xi_name); if (ret) { mlog_errno(ret); goto out; @@ -5663,8 +5667,8 @@ try_again: */ ocfs2_xattr_bucket_relse(xs->bucket); ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh, - xi->name_index, - xi->name, xs); + xi->xi_name_index, + xi->xi_name, xs); if (ret && ret != -ENODATA) goto out; xs->not_found = ret; @@ -5885,7 +5889,7 @@ static int ocfs2_prepare_refcount_xattr(struct inode *inode, * refcount tree, and make the original extent become 3. So we will need * 2 * cluster more extent recs at most. */ - if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) { + if (!xi->xi_value || xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) { ret = ocfs2_refcounted_xattr_delete_need(inode, &(*ref_tree)->rf_ci, -- cgit v1.2.3 From 18853b95d1fb964b76c3393a12c4d861e7779460 Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Fri, 14 Aug 2009 18:17:07 -0700 Subject: ocfs2: Add a name_len field to ocfs2_xattr_info. Rather than calculating strlen all over the place, let's store the name length directly on ocfs2_xattr_info. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/xattr.c | 84 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 44 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index c675a6cda0bb..68126adbf311 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -118,6 +118,7 @@ static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { struct ocfs2_xattr_info { int xi_name_index; const char *xi_name; + int xi_name_len; const void *xi_value; size_t xi_value_len; }; @@ -1361,9 +1362,9 @@ static int ocfs2_xattr_cleanup(struct inode *inode, size_t offs) { int ret = 0; - size_t name_len = strlen(xi->xi_name); void *val = xs->base + offs; - size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + size_t size = OCFS2_XATTR_SIZE(xi->xi_name_len) + + OCFS2_XATTR_ROOT_SIZE; ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -1427,16 +1428,16 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode, struct ocfs2_xattr_value_buf *vb, size_t offs) { - size_t name_len = strlen(xi->xi_name); void *val = xs->base + offs; struct ocfs2_xattr_value_root *xv = NULL; - size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + size_t size = OCFS2_XATTR_SIZE(xi->xi_name_len) + + OCFS2_XATTR_ROOT_SIZE; int ret = 0; memset(val, 0, size); - memcpy(val, xi->xi_name, name_len); + memcpy(val, xi->xi_name, xi->xi_name_len); xv = (struct ocfs2_xattr_value_root *) - (val + OCFS2_XATTR_SIZE(name_len)); + (val + OCFS2_XATTR_SIZE(xi->xi_name_len)); xv->xr_clusters = 0; xv->xr_last_eb_blk = 0; xv->xr_list.l_tree_depth = 0; @@ -1659,7 +1660,6 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode, struct ocfs2_xattr_entry *last, size_t min_offs) { - size_t name_len = strlen(xi->xi_name); struct ocfs2_xa_loc loc; if (xs->xattr_bh == xs->inode_bh) @@ -1673,7 +1673,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode, le16_add_cpu(&xs->header->xh_count, 1); ocfs2_xattr_set_type(last, xi->xi_name_index); ocfs2_xattr_set_local(last, 1); - last->xe_name_len = name_len; + last->xe_name_len = xi->xi_name_len; } else { void *first_val; void *val; @@ -1685,23 +1685,23 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode, if (le64_to_cpu(xs->here->xe_value_size) > OCFS2_XATTR_INLINE_SIZE) - size = OCFS2_XATTR_SIZE(name_len) + + size = OCFS2_XATTR_SIZE(xi->xi_name_len) + OCFS2_XATTR_ROOT_SIZE; else - size = OCFS2_XATTR_SIZE(name_len) + + size = OCFS2_XATTR_SIZE(xi->xi_name_len) + OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); - if (xi->xi_value && size == OCFS2_XATTR_SIZE(name_len) + + if (xi->xi_value && size == OCFS2_XATTR_SIZE(xi->xi_name_len) + OCFS2_XATTR_SIZE(xi->xi_value_len)) { /* The old and the new value have the same size. Just replace the value. */ ocfs2_xattr_set_local(xs->here, 1); xs->here->xe_value_size = cpu_to_le64(xi->xi_value_len); /* Clear value bytes. */ - memset(val + OCFS2_XATTR_SIZE(name_len), + memset(val + OCFS2_XATTR_SIZE(xi->xi_name_len), 0, OCFS2_XATTR_SIZE(xi->xi_value_len)); - memcpy(val + OCFS2_XATTR_SIZE(name_len), + memcpy(val + OCFS2_XATTR_SIZE(xi->xi_name_len), xi->xi_value, xi->xi_value_len); return; @@ -1716,14 +1716,14 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode, } if (xi->xi_value) { /* Insert the new name+value. */ - size_t size = OCFS2_XATTR_SIZE(name_len) + + size_t size = OCFS2_XATTR_SIZE(xi->xi_name_len) + OCFS2_XATTR_SIZE(xi->xi_value_len); void *val = xs->base + min_offs - size; xs->here->xe_name_offset = cpu_to_le16(min_offs - size); memset(val, 0, size); - memcpy(val, xi->xi_name, name_len); - memcpy(val + OCFS2_XATTR_SIZE(name_len), + memcpy(val, xi->xi_name, xi->xi_name_len); + memcpy(val + OCFS2_XATTR_SIZE(xi->xi_name_len), xi->xi_value, xi->xi_value_len); xs->here->xe_value_size = cpu_to_le64(xi->xi_value_len); @@ -1752,13 +1752,14 @@ static int ocfs2_xattr_set_entry(struct inode *inode, struct ocfs2_xattr_entry *last; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; - size_t min_offs = xs->end - xs->base, name_len = strlen(xi->xi_name); + size_t min_offs = xs->end - xs->base; size_t size_l = 0; handle_t *handle = ctxt->handle; int free, i, ret; struct ocfs2_xattr_info xi_l = { .xi_name_index = xi->xi_name_index, .xi_name = xi->xi_name, + .xi_name_len = xi->xi_name_len, .xi_value = xi->xi_value, .xi_value_len = xi->xi_value_len, }; @@ -1790,27 +1791,28 @@ static int ocfs2_xattr_set_entry(struct inode *inode, if (!xs->not_found) { size_t size = 0; if (ocfs2_xattr_is_local(xs->here)) - size = OCFS2_XATTR_SIZE(name_len) + + size = OCFS2_XATTR_SIZE(xi->xi_name_len) + OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); else - size = OCFS2_XATTR_SIZE(name_len) + + size = OCFS2_XATTR_SIZE(xi->xi_name_len) + OCFS2_XATTR_ROOT_SIZE; free += (size + sizeof(struct ocfs2_xattr_entry)); } /* Check free space in inode or block */ if (xi->xi_value && xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { if (free < sizeof(struct ocfs2_xattr_entry) + - OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->xi_name_len) + OCFS2_XATTR_ROOT_SIZE) { ret = -ENOSPC; goto out; } - size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + size_l = OCFS2_XATTR_SIZE(xi->xi_name_len) + + OCFS2_XATTR_ROOT_SIZE; xi_l.xi_value = (void *)&def_xv; xi_l.xi_value_len = OCFS2_XATTR_ROOT_SIZE; } else if (xi->xi_value) { if (free < sizeof(struct ocfs2_xattr_entry) + - OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->xi_name_len) + OCFS2_XATTR_SIZE(xi->xi_value_len)) { ret = -ENOSPC; goto out; @@ -1819,7 +1821,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, if (!xs->not_found) { /* For existing extended attribute */ - size_t size = OCFS2_XATTR_SIZE(name_len) + + size_t size = OCFS2_XATTR_SIZE(xi->xi_name_len) + OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); size_t offs = le16_to_cpu(xs->here->xe_name_offset); void *val = xs->base + offs; @@ -1834,7 +1836,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, } else if (!ocfs2_xattr_is_local(xs->here)) { /* For existing xattr which has value outside */ vb.vb_xv = (struct ocfs2_xattr_value_root *) - (val + OCFS2_XATTR_SIZE(name_len)); + (val + OCFS2_XATTR_SIZE(xi->xi_name_len)); if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { /* @@ -2616,7 +2618,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode, value_size = OCFS2_XATTR_SIZE(xi->xi_value_len); if (free >= sizeof(struct ocfs2_xattr_entry) + - OCFS2_XATTR_SIZE(strlen(xi->xi_name)) + value_size) + OCFS2_XATTR_SIZE(xi->xi_name_len) + value_size) return 1; return 0; @@ -3048,6 +3050,7 @@ int ocfs2_xattr_set_handle(handle_t *handle, struct ocfs2_xattr_info xi = { .xi_name_index = name_index, .xi_name = name, + .xi_name_len = strlen(name), .xi_value = value, .xi_value_len = value_len, }; @@ -3131,6 +3134,7 @@ int ocfs2_xattr_set(struct inode *inode, struct ocfs2_xattr_info xi = { .xi_name_index = name_index, .xi_name = name, + .xi_name_len = strlen(name), .xi_value = value, .xi_value_len = value_len, }; @@ -4980,7 +4984,6 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, int local) { struct ocfs2_xattr_entry *last, *xe; - int name_len = strlen(xi->xi_name); struct ocfs2_xattr_header *xh = xs->header; u16 count = le16_to_cpu(xh->xh_count), start; size_t blocksize = inode->i_sb->s_blocksize; @@ -4995,10 +4998,10 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, xe = xs->here; offs = le16_to_cpu(xe->xe_name_offset); if (ocfs2_xattr_is_local(xe)) - size = OCFS2_XATTR_SIZE(name_len) + + size = OCFS2_XATTR_SIZE(xi->xi_name_len) + OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); else - size = OCFS2_XATTR_SIZE(name_len) + + size = OCFS2_XATTR_SIZE(xi->xi_name_len) + OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE); /* @@ -5008,7 +5011,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, * new_size safely here. * See ocfs2_xattr_set_in_bucket. */ - new_size = OCFS2_XATTR_SIZE(name_len) + + new_size = OCFS2_XATTR_SIZE(xi->xi_name_len) + OCFS2_XATTR_SIZE(xi->xi_value_len); if (xi->xi_value) { @@ -5025,10 +5028,10 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs); - memset(val + OCFS2_XATTR_SIZE(name_len), 0, - size - OCFS2_XATTR_SIZE(name_len)); + memset(val + OCFS2_XATTR_SIZE(xi->xi_name_len), 0, + size - OCFS2_XATTR_SIZE(xi->xi_name_len)); if (OCFS2_XATTR_SIZE(xi->xi_value_len) > 0) - memcpy(val + OCFS2_XATTR_SIZE(name_len), + memcpy(val + OCFS2_XATTR_SIZE(xi->xi_name_len), xi->xi_value, xi->xi_value_len); le16_add_cpu(&xh->xh_name_value_len, new_size); @@ -5069,13 +5072,14 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, le16_add_cpu(&xh->xh_count, 1); memset(xe, 0, sizeof(struct ocfs2_xattr_entry)); xe->xe_name_hash = cpu_to_le32(name_hash); - xe->xe_name_len = name_len; + xe->xe_name_len = xi->xi_name_len; ocfs2_xattr_set_type(xe, xi->xi_name_index); } set_new_name_value: /* Insert the new name+value. */ - size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->xi_value_len); + size = OCFS2_XATTR_SIZE(xi->xi_name_len) + + OCFS2_XATTR_SIZE(xi->xi_value_len); /* * We must make sure that the name/value pair @@ -5094,8 +5098,8 @@ set_new_name_value: xe->xe_name_offset = cpu_to_le16(offs - size); memset(val, 0, size); - memcpy(val, xi->xi_name, name_len); - memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->xi_value, + memcpy(val, xi->xi_name, xi->xi_name_len); + memcpy(val + OCFS2_XATTR_SIZE(xi->xi_name_len), xi->xi_value, xi->xi_value_len); xe->xe_value_size = cpu_to_le64(xi->xi_value_len); @@ -5424,7 +5428,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, char *val = (char *)xi->xi_value; struct ocfs2_xattr_entry *xe = xs->here; u32 name_hash = ocfs2_xattr_name_hash(inode, xi->xi_name, - strlen(xi->xi_name)); + xi->xi_name_len); if (!xs->not_found && !ocfs2_xattr_is_local(xe)) { /* @@ -5537,7 +5541,7 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode, struct ocfs2_xattr_entry *xe; u16 count, header_size, xh_free_start; int free, max_free, need, old; - size_t value_size = 0, name_len = strlen(xi->xi_name); + size_t value_size = 0; size_t blocksize = inode->i_sb->s_blocksize; int ret, allocation = 0; @@ -5564,9 +5568,9 @@ try_again: if (xs->not_found) need = sizeof(struct ocfs2_xattr_entry) + - OCFS2_XATTR_SIZE(name_len) + value_size; + OCFS2_XATTR_SIZE(xi->xi_name_len) + value_size; else { - need = value_size + OCFS2_XATTR_SIZE(name_len); + need = value_size + OCFS2_XATTR_SIZE(xi->xi_name_len); /* * We only replace the old value if the new length is smaller -- cgit v1.2.3 From 199799a3609f6d5bb231a75c2e702afaac805431 Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Fri, 14 Aug 2009 19:04:15 -0700 Subject: ocfs2: Wrap calculation of name+value pair size. An ocfs2 xattr entry stores the text name and value as a pair in the storage area. Obviously names and values can be variable-sized. If a value is too large for the entry storage, a tree root is stored instead. The name+value pair is also padded. Because of this, there are a million places in the code that do: if (needs_external_tree(value_size) namevalue_size = pad(name_size) + tree_root_size; else namevalue_size = pad(name_size) + pad(value_size); Let's create some convenience functions to make the code more readable. There are three forms. The first takes the raw sizes. The second takes an ocfs2_xattr_info structure. The third takes an existing ocfs2_xattr_entry. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/xattr.c | 170 +++++++++++++++++++++---------------------------------- 1 file changed, 65 insertions(+), 105 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 68126adbf311..064ec6d6c23c 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -183,6 +183,33 @@ struct ocfs2_xa_loc { const struct ocfs2_xa_loc_operations *xl_ops; }; +/* + * Convenience functions to calculate how much space is needed for a + * given name+value pair + */ +static int namevalue_size(int name_len, uint64_t value_len) +{ + if (value_len > OCFS2_XATTR_INLINE_SIZE) + return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + else + return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len); +} + +static int namevalue_size_xi(struct ocfs2_xattr_info *xi) +{ + return namevalue_size(xi->xi_name_len, xi->xi_value_len); +} + +static int namevalue_size_xe(struct ocfs2_xattr_entry *xe) +{ + u64 value_len = le64_to_cpu(xe->xe_value_size); + + BUG_ON((value_len > OCFS2_XATTR_INLINE_SIZE) && + ocfs2_xattr_is_local(xe)); + return namevalue_size(xe->xe_name_len, value_len); +} + + static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, struct ocfs2_xattr_header *xh, int index, @@ -529,15 +556,20 @@ static void ocfs2_xattr_hash_entry(struct inode *inode, static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len) { - int size = 0; + return namevalue_size(name_len, value_len) + + sizeof(struct ocfs2_xattr_entry); +} - if (value_len <= OCFS2_XATTR_INLINE_SIZE) - size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len); - else - size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; - size += sizeof(struct ocfs2_xattr_entry); +static int ocfs2_xi_entry_usage(struct ocfs2_xattr_info *xi) +{ + return namevalue_size_xi(xi) + + sizeof(struct ocfs2_xattr_entry); +} - return size; +static int ocfs2_xe_entry_usage(struct ocfs2_xattr_entry *xe) +{ + return namevalue_size_xe(xe) + + sizeof(struct ocfs2_xattr_entry); } int ocfs2_calc_security_init(struct inode *dir, @@ -1363,8 +1395,7 @@ static int ocfs2_xattr_cleanup(struct inode *inode, { int ret = 0; void *val = xs->base + offs; - size_t size = OCFS2_XATTR_SIZE(xi->xi_name_len) + - OCFS2_XATTR_ROOT_SIZE; + size_t size = namevalue_size_xi(xi); ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -1430,8 +1461,7 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode, { void *val = xs->base + offs; struct ocfs2_xattr_value_root *xv = NULL; - size_t size = OCFS2_XATTR_SIZE(xi->xi_name_len) + - OCFS2_XATTR_ROOT_SIZE; + size_t size = namevalue_size_xi(xi); int ret = 0; memset(val, 0, size); @@ -1490,15 +1520,10 @@ static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc) int namevalue_offset, first_namevalue_offset, namevalue_size; struct ocfs2_xattr_entry *entry = loc->xl_entry; struct ocfs2_xattr_header *xh = loc->xl_header; - u64 value_size = le64_to_cpu(entry->xe_value_size); int count = le16_to_cpu(xh->xh_count); namevalue_offset = le16_to_cpu(entry->xe_name_offset); - namevalue_size = OCFS2_XATTR_SIZE(entry->xe_name_len); - if (value_size > OCFS2_XATTR_INLINE_SIZE) - namevalue_size += OCFS2_XATTR_ROOT_SIZE; - else - namevalue_size += OCFS2_XATTR_SIZE(value_size); + namevalue_size = namevalue_size_xe(entry); for (i = 0, first_namevalue_offset = loc->xl_size; i < count; i++) { @@ -1553,17 +1578,8 @@ static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc, static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc) { - int namevalue_size; - struct ocfs2_xattr_entry *entry = loc->xl_entry; - u64 value_size = le64_to_cpu(entry->xe_value_size); - - namevalue_size = OCFS2_XATTR_SIZE(entry->xe_name_len); - if (value_size > OCFS2_XATTR_INLINE_SIZE) - namevalue_size += OCFS2_XATTR_ROOT_SIZE; - else - namevalue_size += OCFS2_XATTR_SIZE(value_size); - - le16_add_cpu(&loc->xl_header->xh_name_value_len, -namevalue_size); + le16_add_cpu(&loc->xl_header->xh_name_value_len, + -namevalue_size_xe(loc->xl_entry)); } /* Operations for xattrs stored in buckets. */ @@ -1683,16 +1699,8 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode, offs = le16_to_cpu(xs->here->xe_name_offset); val = xs->base + offs; - if (le64_to_cpu(xs->here->xe_value_size) > - OCFS2_XATTR_INLINE_SIZE) - size = OCFS2_XATTR_SIZE(xi->xi_name_len) + - OCFS2_XATTR_ROOT_SIZE; - else - size = OCFS2_XATTR_SIZE(xi->xi_name_len) + - OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); - - if (xi->xi_value && size == OCFS2_XATTR_SIZE(xi->xi_name_len) + - OCFS2_XATTR_SIZE(xi->xi_value_len)) { + size = namevalue_size_xe(xs->here); + if (xi->xi_value && (size == namevalue_size_xi(xi))) { /* The old and the new value have the same size. Just replace the value. */ ocfs2_xattr_set_local(xs->here, 1); @@ -1716,8 +1724,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode, } if (xi->xi_value) { /* Insert the new name+value. */ - size_t size = OCFS2_XATTR_SIZE(xi->xi_name_len) + - OCFS2_XATTR_SIZE(xi->xi_value_len); + size_t size = namevalue_size_xi(xi); void *val = xs->base + min_offs - size; xs->here->xe_name_offset = cpu_to_le16(min_offs - size); @@ -1788,41 +1795,25 @@ static int ocfs2_xattr_set_entry(struct inode *inode, if (free < 0) return -EIO; - if (!xs->not_found) { - size_t size = 0; - if (ocfs2_xattr_is_local(xs->here)) - size = OCFS2_XATTR_SIZE(xi->xi_name_len) + - OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); - else - size = OCFS2_XATTR_SIZE(xi->xi_name_len) + - OCFS2_XATTR_ROOT_SIZE; - free += (size + sizeof(struct ocfs2_xattr_entry)); - } + if (!xs->not_found) + free += ocfs2_xe_entry_usage(xs->here); + /* Check free space in inode or block */ - if (xi->xi_value && xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { - if (free < sizeof(struct ocfs2_xattr_entry) + - OCFS2_XATTR_SIZE(xi->xi_name_len) + - OCFS2_XATTR_ROOT_SIZE) { + if (xi->xi_value) { + if (free < ocfs2_xi_entry_usage(xi)) { ret = -ENOSPC; goto out; } - size_l = OCFS2_XATTR_SIZE(xi->xi_name_len) + - OCFS2_XATTR_ROOT_SIZE; - xi_l.xi_value = (void *)&def_xv; - xi_l.xi_value_len = OCFS2_XATTR_ROOT_SIZE; - } else if (xi->xi_value) { - if (free < sizeof(struct ocfs2_xattr_entry) + - OCFS2_XATTR_SIZE(xi->xi_name_len) + - OCFS2_XATTR_SIZE(xi->xi_value_len)) { - ret = -ENOSPC; - goto out; + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + size_l = namevalue_size_xi(xi); + xi_l.xi_value = (void *)&def_xv; + xi_l.xi_value_len = OCFS2_XATTR_ROOT_SIZE; } } if (!xs->not_found) { /* For existing extended attribute */ - size_t size = OCFS2_XATTR_SIZE(xi->xi_name_len) + - OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); + size_t size = namevalue_size_xe(xs->here); size_t offs = le16_to_cpu(xs->here->xe_name_offset); void *val = xs->base + offs; @@ -2589,7 +2580,6 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode, struct ocfs2_xattr_info *xi, struct ocfs2_xattr_search *xs) { - u64 value_size; struct ocfs2_xattr_entry *last; int free, i; size_t min_offs = xs->end - xs->base; @@ -2612,13 +2602,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode, BUG_ON(!xs->not_found); - if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) - value_size = OCFS2_XATTR_ROOT_SIZE; - else - value_size = OCFS2_XATTR_SIZE(xi->xi_value_len); - - if (free >= sizeof(struct ocfs2_xattr_entry) + - OCFS2_XATTR_SIZE(xi->xi_name_len) + value_size) + if (free >= (sizeof(struct ocfs2_xattr_entry) + namevalue_size_xi(xi))) return 1; return 0; @@ -3980,7 +3964,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode, struct ocfs2_xattr_bucket *bucket) { int ret, i; - size_t end, offset, len, value_len; + size_t end, offset, len; struct ocfs2_xattr_header *xh; char *entries, *buf, *bucket_buf = NULL; u64 blkno = bucket_blkno(bucket); @@ -4034,12 +4018,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode, end = OCFS2_XATTR_BUCKET_SIZE; for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) { offset = le16_to_cpu(xe->xe_name_offset); - if (ocfs2_xattr_is_local(xe)) - value_len = OCFS2_XATTR_SIZE( - le64_to_cpu(xe->xe_value_size)); - else - value_len = OCFS2_XATTR_ROOT_SIZE; - len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len; + len = namevalue_size_xe(xe); /* * We must make sure that the name/value pair @@ -4228,7 +4207,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, int new_bucket_head) { int ret, i; - int count, start, len, name_value_len = 0, xe_len, name_offset = 0; + int count, start, len, name_value_len = 0, name_offset = 0; struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; struct ocfs2_xattr_header *xh; struct ocfs2_xattr_entry *xe; @@ -4319,13 +4298,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, name_value_len = 0; for (i = 0; i < start; i++) { xe = &xh->xh_entries[i]; - xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len); - if (ocfs2_xattr_is_local(xe)) - xe_len += - OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); - else - xe_len += OCFS2_XATTR_ROOT_SIZE; - name_value_len += xe_len; + name_value_len += namevalue_size_xe(xe); if (le16_to_cpu(xe->xe_name_offset) < name_offset) name_offset = le16_to_cpu(xe->xe_name_offset); } @@ -4355,12 +4328,6 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { xe = &xh->xh_entries[i]; - xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len); - if (ocfs2_xattr_is_local(xe)) - xe_len += - OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); - else - xe_len += OCFS2_XATTR_ROOT_SIZE; if (le16_to_cpu(xe->xe_name_offset) < le16_to_cpu(xh->xh_free_start)) xh->xh_free_start = xe->xe_name_offset; @@ -4997,12 +4964,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, if (!xs->not_found) { xe = xs->here; offs = le16_to_cpu(xe->xe_name_offset); - if (ocfs2_xattr_is_local(xe)) - size = OCFS2_XATTR_SIZE(xi->xi_name_len) + - OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); - else - size = OCFS2_XATTR_SIZE(xi->xi_name_len) + - OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE); + size = namevalue_size_xe(xe); /* * If the new value will be stored outside, xi->xi_value has @@ -5011,8 +4973,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, * new_size safely here. * See ocfs2_xattr_set_in_bucket. */ - new_size = OCFS2_XATTR_SIZE(xi->xi_name_len) + - OCFS2_XATTR_SIZE(xi->xi_value_len); + new_size = namevalue_size_xi(xi); if (xi->xi_value) { ocfs2_xa_wipe_namevalue(&loc); @@ -5078,8 +5039,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, set_new_name_value: /* Insert the new name+value. */ - size = OCFS2_XATTR_SIZE(xi->xi_name_len) + - OCFS2_XATTR_SIZE(xi->xi_value_len); + size = namevalue_size_xi(xi); /* * We must make sure that the name/value pair -- cgit v1.2.3 From 69a3e539d083ac09aec92b8705b8ff2c2e5c810c Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Mon, 17 Aug 2009 12:24:39 -0700 Subject: ocfs2: Set the xattr name+value pair in one place We create two new functions on ocfs2_xa_loc, ocfs2_xa_prepare_entry() and ocfs2_xa_store_inline_value(). ocfs2_xa_prepare_entry() makes sure that the xl_entry field of ocfs2_xa_loc is ready to receive an xattr. The entry will point to an appropriately sized name+value region in storage. If an existing entry can be reused, it will be. If no entry already exists, it will be allocated. If there isn't space to allocate it, -ENOSPC will be returned. ocfs2_xa_store_inline_value() stores the data that goes into the 'value' part of the name+value pair. For values that don't fit directly, this stores the value tree root. A number of operations are added to ocfs2_xa_loc_operations to support these functions. This reflects the disparate behaviors of xattr blocks and buckets. With these functions, the overlapping ocfs2_xattr_set_entry_local() and ocfs2_xattr_set_entry_normal() can be replaced with a single call scheme. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/xattr.c | 634 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 411 insertions(+), 223 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 064ec6d6c23c..4bf6ec849e19 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -147,11 +147,31 @@ struct ocfs2_xa_loc_operations { */ void *(*xlo_offset_pointer)(struct ocfs2_xa_loc *loc, int offset); + /* Can we reuse the existing entry for the new value? */ + int (*xlo_can_reuse)(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi); + + /* How much space is needed for the new value? */ + int (*xlo_check_space)(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi); + + /* + * Return the offset of the first name+value pair. This is + * the start of our downward-filling free space. + */ + int (*xlo_get_free_start)(struct ocfs2_xa_loc *loc); + /* * Remove the name+value at this location. Do whatever is * appropriate with the remaining name+value pairs. */ void (*xlo_wipe_namevalue)(struct ocfs2_xa_loc *loc); + + /* Fill xl_entry with a new entry */ + void (*xlo_add_entry)(struct ocfs2_xa_loc *loc, u32 name_hash); + + /* Add name+value storage to an entry */ + void (*xlo_add_namevalue)(struct ocfs2_xa_loc *loc, int size); }; /* @@ -1493,6 +1513,33 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode, return ret; } +static int ocfs2_xa_check_space_helper(int needed_space, int free_start, + int num_entries) +{ + int free_space; + + if (!needed_space) + return 0; + + free_space = free_start - + sizeof(struct ocfs2_xattr_header) - + (num_entries * sizeof(struct ocfs2_xattr_entry)) - + OCFS2_XATTR_HEADER_GAP; + if (free_space < 0) + return -EIO; + if (free_space < needed_space) + return -ENOSPC; + + return 0; +} + +/* Give a pointer into the storage for the given offset */ +static void *ocfs2_xa_offset_pointer(struct ocfs2_xa_loc *loc, int offset) +{ + BUG_ON(offset >= loc->xl_size); + return loc->xl_ops->xlo_offset_pointer(loc, offset); +} + /* * Wipe the name+value pair and allow the storage to reclaim it. This * must be followed by either removal of the entry or a call to @@ -1503,13 +1550,117 @@ static void ocfs2_xa_wipe_namevalue(struct ocfs2_xa_loc *loc) loc->xl_ops->xlo_wipe_namevalue(loc); } +/* + * Find lowest offset to a name+value pair. This is the start of our + * downward-growing free space. + */ +static int ocfs2_xa_get_free_start(struct ocfs2_xa_loc *loc) +{ + return loc->xl_ops->xlo_get_free_start(loc); +} + +/* Can we reuse loc->xl_entry for xi? */ +static int ocfs2_xa_can_reuse_entry(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + return loc->xl_ops->xlo_can_reuse(loc, xi); +} + +/* How much free space is needed to set the new value */ +static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + return loc->xl_ops->xlo_check_space(loc, xi); +} + +static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash) +{ + loc->xl_ops->xlo_add_entry(loc, name_hash); + loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash); + /* + * We can't leave the new entry's xe_name_offset at zero or + * add_namevalue() will go nuts. We set it to the size of our + * storage so that it can never be less than any other entry. + */ + loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size); +} + +static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + int size = namevalue_size_xi(xi); + int nameval_offset; + char *nameval_buf; + + loc->xl_ops->xlo_add_namevalue(loc, size); + loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len); + loc->xl_entry->xe_name_len = xi->xi_name_len; + ocfs2_xattr_set_type(loc->xl_entry, xi->xi_name_index); + ocfs2_xattr_set_local(loc->xl_entry, + xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE); + + nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); + nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset); + memset(nameval_buf, 0, size); + memcpy(nameval_buf, xi->xi_name, xi->xi_name_len); +} + static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc, int offset) { - BUG_ON(offset >= loc->xl_size); return (char *)loc->xl_header + offset; } +static int ocfs2_xa_block_can_reuse(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + /* + * Block storage is strict. If the sizes aren't exact, we will + * remove the old one and reinsert the new. + */ + return namevalue_size_xe(loc->xl_entry) == + namevalue_size_xi(xi); +} + +static int ocfs2_xa_block_get_free_start(struct ocfs2_xa_loc *loc) +{ + struct ocfs2_xattr_header *xh = loc->xl_header; + int i, count = le16_to_cpu(xh->xh_count); + int offset, free_start = loc->xl_size; + + for (i = 0; i < count; i++) { + offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset); + if (offset < free_start) + free_start = offset; + } + + return free_start; +} + +static int ocfs2_xa_block_check_space(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + int count = le16_to_cpu(loc->xl_header->xh_count); + int free_start = ocfs2_xa_get_free_start(loc); + int needed_space = ocfs2_xi_entry_usage(xi); + + /* + * Block storage will reclaim the original entry before inserting + * the new value, so we only need the difference. If the new + * entry is smaller than the old one, we don't need anything. + */ + if (loc->xl_entry) { + /* Don't need space if we're reusing! */ + if (ocfs2_xa_can_reuse_entry(loc, xi)) + needed_space = 0; + else + needed_space -= ocfs2_xe_entry_usage(loc->xl_entry); + } + if (needed_space < 0) + needed_space = 0; + return ocfs2_xa_check_space_helper(needed_space, free_start, count); +} + /* * Block storage for xattrs keeps the name+value pairs compacted. When * we remove one, we have to shift any that preceded it towards the end. @@ -1524,13 +1675,7 @@ static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc) namevalue_offset = le16_to_cpu(entry->xe_name_offset); namevalue_size = namevalue_size_xe(entry); - - for (i = 0, first_namevalue_offset = loc->xl_size; - i < count; i++) { - offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset); - if (offset < first_namevalue_offset) - first_namevalue_offset = offset; - } + first_namevalue_offset = ocfs2_xa_get_free_start(loc); /* Shift the name+value pairs */ memmove((char *)xh + first_namevalue_offset + namevalue_size, @@ -1552,13 +1697,33 @@ static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc) */ } +static void ocfs2_xa_block_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash) +{ + int count = le16_to_cpu(loc->xl_header->xh_count); + loc->xl_entry = &(loc->xl_header->xh_entries[count]); + le16_add_cpu(&loc->xl_header->xh_count, 1); + memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry)); +} + +static void ocfs2_xa_block_add_namevalue(struct ocfs2_xa_loc *loc, int size) +{ + int free_start = ocfs2_xa_get_free_start(loc); + + loc->xl_entry->xe_name_offset = cpu_to_le16(free_start - size); +} + /* * Operations for xattrs stored in blocks. This includes inline inode * storage and unindexed ocfs2_xattr_blocks. */ static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = { .xlo_offset_pointer = ocfs2_xa_block_offset_pointer, + .xlo_check_space = ocfs2_xa_block_check_space, + .xlo_can_reuse = ocfs2_xa_block_can_reuse, + .xlo_get_free_start = ocfs2_xa_block_get_free_start, .xlo_wipe_namevalue = ocfs2_xa_block_wipe_namevalue, + .xlo_add_entry = ocfs2_xa_block_add_entry, + .xlo_add_namevalue = ocfs2_xa_block_add_namevalue, }; static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc, @@ -1567,8 +1732,6 @@ static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc, struct ocfs2_xattr_bucket *bucket = loc->xl_storage; int block, block_offset; - BUG_ON(offset >= OCFS2_XATTR_BUCKET_SIZE); - /* The header is at the front of the bucket */ block = offset >> bucket->bu_inode->i_sb->s_blocksize_bits; block_offset = offset % bucket->bu_inode->i_sb->s_blocksize; @@ -1576,16 +1739,145 @@ static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc, return bucket_block(bucket, block) + block_offset; } +static int ocfs2_xa_bucket_can_reuse(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + return namevalue_size_xe(loc->xl_entry) >= + namevalue_size_xi(xi); +} + +static int ocfs2_xa_bucket_get_free_start(struct ocfs2_xa_loc *loc) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + return le16_to_cpu(bucket_xh(bucket)->xh_free_start); +} + +static int ocfs2_bucket_align_free_start(struct super_block *sb, + int free_start, int size) +{ + /* + * We need to make sure that the name+value pair fits within + * one block. + */ + if (((free_start - size) >> sb->s_blocksize_bits) != + ((free_start - 1) >> sb->s_blocksize_bits)) + free_start -= free_start % sb->s_blocksize; + + return free_start; +} + +static int ocfs2_xa_bucket_check_space(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + int rc; + int count = le16_to_cpu(loc->xl_header->xh_count); + int free_start = ocfs2_xa_get_free_start(loc); + int needed_space = ocfs2_xi_entry_usage(xi); + int size = namevalue_size_xi(xi); + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + struct super_block *sb = bucket->bu_inode->i_sb; + + /* + * Bucket storage does not reclaim name+value pairs it cannot + * reuse. They live as holes until the bucket fills, and then + * the bucket is defragmented. However, the bucket can reclaim + * the ocfs2_xattr_entry. + */ + if (loc->xl_entry) { + /* Don't need space if we're reusing! */ + if (ocfs2_xa_can_reuse_entry(loc, xi)) + needed_space = 0; + else + needed_space -= sizeof(struct ocfs2_xattr_entry); + } + BUG_ON(needed_space < 0); + + if (free_start < size) { + if (needed_space) + return -ENOSPC; + } else { + /* + * First we check if it would fit in the first place. + * Below, we align the free start to a block. This may + * slide us below the minimum gap. By checking unaligned + * first, we avoid that error. + */ + rc = ocfs2_xa_check_space_helper(needed_space, free_start, + count); + if (rc) + return rc; + free_start = ocfs2_bucket_align_free_start(sb, free_start, + size); + } + return ocfs2_xa_check_space_helper(needed_space, free_start, count); +} + static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc) { le16_add_cpu(&loc->xl_header->xh_name_value_len, -namevalue_size_xe(loc->xl_entry)); } +static void ocfs2_xa_bucket_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash) +{ + struct ocfs2_xattr_header *xh = loc->xl_header; + int count = le16_to_cpu(xh->xh_count); + int low = 0, high = count - 1, tmp; + struct ocfs2_xattr_entry *tmp_xe; + + /* + * We keep buckets sorted by name_hash, so we need to find + * our insert place. + */ + while (low <= high && count) { + tmp = (low + high) / 2; + tmp_xe = &xh->xh_entries[tmp]; + + if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash)) + low = tmp + 1; + else if (name_hash < le32_to_cpu(tmp_xe->xe_name_hash)) + high = tmp - 1; + else { + low = tmp; + break; + } + } + + if (low != count) + memmove(&xh->xh_entries[low + 1], + &xh->xh_entries[low], + ((count - low) * sizeof(struct ocfs2_xattr_entry))); + + le16_add_cpu(&xh->xh_count, 1); + loc->xl_entry = &xh->xh_entries[low]; + memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry)); +} + +static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size) +{ + int free_start = ocfs2_xa_get_free_start(loc); + struct ocfs2_xattr_header *xh = loc->xl_header; + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + struct super_block *sb = bucket->bu_inode->i_sb; + int nameval_offset; + + free_start = ocfs2_bucket_align_free_start(sb, free_start, size); + nameval_offset = free_start - size; + loc->xl_entry->xe_name_offset = cpu_to_le16(nameval_offset); + xh->xh_free_start = cpu_to_le16(nameval_offset); + le16_add_cpu(&xh->xh_name_value_len, size); + +} + /* Operations for xattrs stored in buckets. */ static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = { .xlo_offset_pointer = ocfs2_xa_bucket_offset_pointer, + .xlo_check_space = ocfs2_xa_bucket_check_space, + .xlo_can_reuse = ocfs2_xa_bucket_can_reuse, + .xlo_get_free_start = ocfs2_xa_bucket_get_free_start, .xlo_wipe_namevalue = ocfs2_xa_bucket_wipe_namevalue, + .xlo_add_entry = ocfs2_xa_bucket_add_entry, + .xlo_add_namevalue = ocfs2_xa_bucket_add_namevalue, }; static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc) @@ -1615,6 +1907,77 @@ static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc) } } +/* + * Prepares loc->xl_entry to receive the new xattr. This includes + * properly setting up the name+value pair region. If loc->xl_entry + * already exists, it will take care of modifying it appropriately. + * This also includes deleting entries, but don't call this to remove + * a non-existant entry. That's just a bug. + * + * Note that this modifies the data. You did journal_access already, + * right? + */ +static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi, + u32 name_hash) +{ + int rc = 0; + int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len); + char *nameval_buf; + + if (!xi->xi_value) { + ocfs2_xa_remove_entry(loc); + goto out; + } + + rc = ocfs2_xa_check_space(loc, xi); + if (rc) + goto out; + + if (loc->xl_entry) { + if (ocfs2_xa_can_reuse_entry(loc, xi)) { + nameval_buf = ocfs2_xa_offset_pointer(loc, + le16_to_cpu(loc->xl_entry->xe_name_offset)); + memset(nameval_buf + name_size, 0, + namevalue_size_xe(loc->xl_entry) - name_size); + loc->xl_entry->xe_value_size = + cpu_to_le64(xi->xi_value_len); + goto out; + } + + ocfs2_xa_wipe_namevalue(loc); + } else + ocfs2_xa_add_entry(loc, name_hash); + + /* + * If we get here, we have a blank entry. Fill it. We grow our + * name+value pair back from the end. + */ + ocfs2_xa_add_namevalue(loc, xi); + +out: + return rc; +} + +/* + * Store the value portion of the name+value pair. This is either an + * inline value or the tree root of an external value. + */ +static void ocfs2_xa_store_inline_value(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi) +{ + int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); + int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len); + int size = namevalue_size_xi(xi); + char *nameval_buf; + + if (!xi->xi_value) + return; + + nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset); + memcpy(nameval_buf + name_size, xi->xi_value, size - name_size); +} + static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc, struct inode *inode, struct buffer_head *bh, @@ -1665,81 +2028,6 @@ static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc, loc->xl_size = OCFS2_XATTR_BUCKET_SIZE; } -/* - * ocfs2_xattr_set_entry_local() - * - * Set, replace or remove extended attribute in local. - */ -static void ocfs2_xattr_set_entry_local(struct inode *inode, - struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs, - struct ocfs2_xattr_entry *last, - size_t min_offs) -{ - struct ocfs2_xa_loc loc; - - if (xs->xattr_bh == xs->inode_bh) - ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh, - xs->not_found ? NULL : xs->here); - else - ocfs2_init_xattr_block_xa_loc(&loc, xs->xattr_bh, - xs->not_found ? NULL : xs->here); - if (xi->xi_value && xs->not_found) { - /* Insert the new xattr entry. */ - le16_add_cpu(&xs->header->xh_count, 1); - ocfs2_xattr_set_type(last, xi->xi_name_index); - ocfs2_xattr_set_local(last, 1); - last->xe_name_len = xi->xi_name_len; - } else { - void *first_val; - void *val; - size_t offs, size; - - first_val = xs->base + min_offs; - offs = le16_to_cpu(xs->here->xe_name_offset); - val = xs->base + offs; - - size = namevalue_size_xe(xs->here); - if (xi->xi_value && (size == namevalue_size_xi(xi))) { - /* The old and the new value have the - same size. Just replace the value. */ - ocfs2_xattr_set_local(xs->here, 1); - xs->here->xe_value_size = cpu_to_le64(xi->xi_value_len); - /* Clear value bytes. */ - memset(val + OCFS2_XATTR_SIZE(xi->xi_name_len), - 0, - OCFS2_XATTR_SIZE(xi->xi_value_len)); - memcpy(val + OCFS2_XATTR_SIZE(xi->xi_name_len), - xi->xi_value, - xi->xi_value_len); - return; - } - - if (!xi->xi_value) - ocfs2_xa_remove_entry(&loc); - else - ocfs2_xa_wipe_namevalue(&loc); - - min_offs += size; - } - if (xi->xi_value) { - /* Insert the new name+value. */ - size_t size = namevalue_size_xi(xi); - void *val = xs->base + min_offs - size; - - xs->here->xe_name_offset = cpu_to_le16(min_offs - size); - memset(val, 0, size); - memcpy(val, xi->xi_name, xi->xi_name_len); - memcpy(val + OCFS2_XATTR_SIZE(xi->xi_name_len), - xi->xi_value, - xi->xi_value_len); - xs->here->xe_value_size = cpu_to_le64(xi->xi_value_len); - ocfs2_xattr_set_local(xs->here, 1); - ocfs2_xattr_hash_entry(inode, xs->header, xs->here); - } - - return; -} /* * ocfs2_xattr_set_entry() @@ -1747,7 +2035,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode, * Set extended attribute entry into inode or block. * * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE, - * We first insert tree root(ocfs2_xattr_value_root) with set_entry_local(), + * We first insert tree root(ocfs2_xattr_value_root) like a normal value, * then set value in B tree with set_value_outside(). */ static int ocfs2_xattr_set_entry(struct inode *inode, @@ -1763,6 +2051,9 @@ static int ocfs2_xattr_set_entry(struct inode *inode, size_t size_l = 0; handle_t *handle = ctxt->handle; int free, i, ret; + u32 name_hash = ocfs2_xattr_name_hash(inode, xi->xi_name, + xi->xi_name_len); + struct ocfs2_xa_loc loc; struct ocfs2_xattr_info xi_l = { .xi_name_index = xi->xi_name_index, .xi_name = xi->xi_name, @@ -1894,11 +2185,28 @@ static int ocfs2_xattr_set_entry(struct inode *inode, } } + if (xs->xattr_bh == xs->inode_bh) + ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh, + xs->not_found ? NULL : xs->here); + else + ocfs2_init_xattr_block_xa_loc(&loc, xs->xattr_bh, + xs->not_found ? NULL : xs->here); + /* - * Set value in local, include set tree root in local. - * This is the first step for value size >INLINE_SIZE. + * Prepare our entry and insert the inline value. This will + * be a value tree root for values that are larger than + * OCFS2_XATTR_INLINE_SIZE. */ - ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs); + ret = ocfs2_xa_prepare_entry(&loc, xi, name_hash); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + /* XXX For now, until we make ocfs2_xa_prepare_entry() primary */ + BUG_ON(ret == -ENOSPC); + ocfs2_xa_store_inline_value(&loc, xi); + xs->here = loc.xl_entry; if (!(flag & OCFS2_INLINE_XATTR_FL)) { ret = ocfs2_journal_dirty(handle, xs->xattr_bh); @@ -4938,139 +5246,6 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode, return bucket_block(bucket, block_off) + offs; } -/* - * Handle the normal xattr set, including replace, delete and new. - * - * Note: "local" indicates the real data's locality. So we can't - * just its bucket locality by its length. - */ -static void ocfs2_xattr_set_entry_normal(struct inode *inode, - struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs, - u32 name_hash, - int local) -{ - struct ocfs2_xattr_entry *last, *xe; - struct ocfs2_xattr_header *xh = xs->header; - u16 count = le16_to_cpu(xh->xh_count), start; - size_t blocksize = inode->i_sb->s_blocksize; - char *val; - size_t offs, size, new_size; - struct ocfs2_xa_loc loc; - - ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket, - xs->not_found ? NULL : xs->here); - last = &xh->xh_entries[count]; - if (!xs->not_found) { - xe = xs->here; - offs = le16_to_cpu(xe->xe_name_offset); - size = namevalue_size_xe(xe); - - /* - * If the new value will be stored outside, xi->xi_value has - * been initalized as an empty ocfs2_xattr_value_root, and - * the same goes with xi->xi_value_len, so we can set - * new_size safely here. - * See ocfs2_xattr_set_in_bucket. - */ - new_size = namevalue_size_xi(xi); - - if (xi->xi_value) { - ocfs2_xa_wipe_namevalue(&loc); - if (new_size > size) - goto set_new_name_value; - - /* Now replace the old value with new one. */ - if (local) - xe->xe_value_size = - cpu_to_le64(xi->xi_value_len); - else - xe->xe_value_size = 0; - - val = ocfs2_xattr_bucket_get_val(inode, - xs->bucket, offs); - memset(val + OCFS2_XATTR_SIZE(xi->xi_name_len), 0, - size - OCFS2_XATTR_SIZE(xi->xi_name_len)); - if (OCFS2_XATTR_SIZE(xi->xi_value_len) > 0) - memcpy(val + OCFS2_XATTR_SIZE(xi->xi_name_len), - xi->xi_value, xi->xi_value_len); - - le16_add_cpu(&xh->xh_name_value_len, new_size); - ocfs2_xattr_set_local(xe, local); - return; - } else { - ocfs2_xa_remove_entry(&loc); - if (!xh->xh_count) - xh->xh_free_start = - cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); - - return; - } - } else { - /* find a new entry for insert. */ - int low = 0, high = count - 1, tmp; - struct ocfs2_xattr_entry *tmp_xe; - - while (low <= high && count) { - tmp = (low + high) / 2; - tmp_xe = &xh->xh_entries[tmp]; - - if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash)) - low = tmp + 1; - else if (name_hash < - le32_to_cpu(tmp_xe->xe_name_hash)) - high = tmp - 1; - else { - low = tmp; - break; - } - } - - xe = &xh->xh_entries[low]; - if (low != count) - memmove(xe + 1, xe, (void *)last - (void *)xe); - - le16_add_cpu(&xh->xh_count, 1); - memset(xe, 0, sizeof(struct ocfs2_xattr_entry)); - xe->xe_name_hash = cpu_to_le32(name_hash); - xe->xe_name_len = xi->xi_name_len; - ocfs2_xattr_set_type(xe, xi->xi_name_index); - } - -set_new_name_value: - /* Insert the new name+value. */ - size = namevalue_size_xi(xi); - - /* - * We must make sure that the name/value pair - * exists in the same block. - */ - offs = le16_to_cpu(xh->xh_free_start); - start = offs - size; - - if (start >> inode->i_sb->s_blocksize_bits != - (offs - 1) >> inode->i_sb->s_blocksize_bits) { - offs = offs - offs % blocksize; - xh->xh_free_start = cpu_to_le16(offs); - } - - val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size); - xe->xe_name_offset = cpu_to_le16(offs - size); - - memset(val, 0, size); - memcpy(val, xi->xi_name, xi->xi_name_len); - memcpy(val + OCFS2_XATTR_SIZE(xi->xi_name_len), xi->xi_value, - xi->xi_value_len); - - xe->xe_value_size = cpu_to_le64(xi->xi_value_len); - ocfs2_xattr_set_local(xe, local); - xs->here = xe; - le16_add_cpu(&xh->xh_free_start, -size); - le16_add_cpu(&xh->xh_name_value_len, size); - - return; -} - /* * Set the xattr entry in the specified bucket. * The bucket is indicated by xs->bucket and it should have the enough @@ -5085,6 +5260,7 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, { int ret; u64 blkno; + struct ocfs2_xa_loc loc; mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n", (unsigned long)xi->xi_value_len, xi->xi_name_index, @@ -5107,7 +5283,19 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, goto out; } - ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local); + ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket, + xs->not_found ? NULL : xs->here); + ret = ocfs2_xa_prepare_entry(&loc, xi, name_hash); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + /* XXX For now, until we make ocfs2_xa_prepare_entry() primary */ + BUG_ON(ret == -ENOSPC); + ocfs2_xa_store_inline_value(&loc, xi); + xs->here = loc.xl_entry; + ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket); out: -- cgit v1.2.3 From 9dc474005d0e34cf21d4b510f347e3942f24b021 Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Mon, 17 Aug 2009 19:56:01 -0700 Subject: ocfs2: Handle value tree roots in ocfs2_xa_set_inline_value() Previously the xattr code would send in a fake value, containing a tree root, to the function that installed name+value pairs. Instead, we pass the real value to ocfs2_xa_set_inline_value(), and it notices that the value cannot fit. Thus, it installs a tree root. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/xattr.c | 54 ++++++++++++++++-------------------------------------- 1 file changed, 16 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 4bf6ec849e19..6d362e2d3960 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1968,14 +1968,19 @@ static void ocfs2_xa_store_inline_value(struct ocfs2_xa_loc *loc, { int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len); - int size = namevalue_size_xi(xi); + int inline_value_size = namevalue_size_xi(xi) - name_size; + const void *value = xi->xi_value; char *nameval_buf; if (!xi->xi_value) return; + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + value = &def_xv; + inline_value_size = OCFS2_XATTR_ROOT_SIZE; + } nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset); - memcpy(nameval_buf + name_size, xi->xi_value, size - name_size); + memcpy(nameval_buf + name_size, value, inline_value_size); } static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc, @@ -2054,13 +2059,6 @@ static int ocfs2_xattr_set_entry(struct inode *inode, u32 name_hash = ocfs2_xattr_name_hash(inode, xi->xi_name, xi->xi_name_len); struct ocfs2_xa_loc loc; - struct ocfs2_xattr_info xi_l = { - .xi_name_index = xi->xi_name_index, - .xi_name = xi->xi_name, - .xi_name_len = xi->xi_name_len, - .xi_value = xi->xi_value, - .xi_value_len = xi->xi_value_len, - }; struct ocfs2_xattr_value_buf vb = { .vb_bh = xs->xattr_bh, .vb_access = ocfs2_journal_access_di, @@ -2090,16 +2088,9 @@ static int ocfs2_xattr_set_entry(struct inode *inode, free += ocfs2_xe_entry_usage(xs->here); /* Check free space in inode or block */ - if (xi->xi_value) { - if (free < ocfs2_xi_entry_usage(xi)) { - ret = -ENOSPC; - goto out; - } - if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { - size_l = namevalue_size_xi(xi); - xi_l.xi_value = (void *)&def_xv; - xi_l.xi_value_len = OCFS2_XATTR_ROOT_SIZE; - } + if (xi->xi_value && (free < ocfs2_xi_entry_usage(xi))) { + ret = -ENOSPC; + goto out; } if (!xs->not_found) { @@ -5255,8 +5246,7 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, handle_t *handle, struct ocfs2_xattr_info *xi, struct ocfs2_xattr_search *xs, - u32 name_hash, - int local) + u32 name_hash) { int ret; u64 blkno; @@ -5571,13 +5561,14 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, struct ocfs2_xattr_search *xs, struct ocfs2_xattr_set_ctxt *ctxt) { - int ret, local = 1; + int ret; size_t value_len; char *val = (char *)xi->xi_value; struct ocfs2_xattr_entry *xe = xs->here; u32 name_hash = ocfs2_xattr_name_hash(inode, xi->xi_name, xi->xi_name_len); + value_len = xi->xi_value_len; if (!xs->not_found && !ocfs2_xattr_is_local(xe)) { /* * We need to truncate the xattr storage first. @@ -5591,9 +5582,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, * the modification to the xattr block will be done * by following steps. */ - if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) - value_len = xi->xi_value_len; - else + if (xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) value_len = 0; ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, @@ -5606,26 +5595,15 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, goto set_value_outside; } - value_len = xi->xi_value_len; /* So we have to handle the inside block change now. */ - if (value_len > OCFS2_XATTR_INLINE_SIZE) { - /* - * If the new value will be stored outside of block, - * initalize a new empty value root and insert it first. - */ - local = 0; - xi->xi_value = &def_xv; - xi->xi_value_len = OCFS2_XATTR_ROOT_SIZE; - } - ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs, - name_hash, local); + name_hash); if (ret) { mlog_errno(ret); goto out; } - if (value_len <= OCFS2_XATTR_INLINE_SIZE) + if (xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) goto out; /* allocate the space now for the outside block storage. */ -- cgit v1.2.3 From 3fc12afa0cea5dc8845487b685165c89934ca1eb Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Tue, 18 Aug 2009 13:20:27 -0700 Subject: ocfs2: Provide ocfs2_xa_fill_value_buf() for external value processing We use the ocfs2_xattr_value_buf structure to manage external values. It lets the value tree code do its work regardless of the containing storage. ocfs2_xa_fill_value_buf() initializes a value buf from an ocfs2_xa_loc entry. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/xattr.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 6d362e2d3960..e0d0fa23a19b 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -172,6 +172,13 @@ struct ocfs2_xa_loc_operations { /* Add name+value storage to an entry */ void (*xlo_add_namevalue)(struct ocfs2_xa_loc *loc, int size); + + /* + * Initialize the value buf's access and bh fields for this entry. + * ocfs2_xa_fill_value_buf() will handle the xv pointer. + */ + void (*xlo_fill_value_buf)(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_value_buf *vb); }; /* @@ -1605,6 +1612,23 @@ static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc, memcpy(nameval_buf, xi->xi_name, xi->xi_name_len); } +static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_value_buf *vb) +{ + int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); + int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len); + + /* Value bufs are for value trees */ + BUG_ON(namevalue_size_xe(loc->xl_entry) != + (name_size + OCFS2_XATTR_ROOT_SIZE)); + + loc->xl_ops->xlo_fill_value_buf(loc, vb); + vb->vb_xv = + (struct ocfs2_xattr_value_root *)ocfs2_xa_offset_pointer(loc, + nameval_offset + + name_size); +} + static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc, int offset) { @@ -1712,6 +1736,20 @@ static void ocfs2_xa_block_add_namevalue(struct ocfs2_xa_loc *loc, int size) loc->xl_entry->xe_name_offset = cpu_to_le16(free_start - size); } +static void ocfs2_xa_block_fill_value_buf(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_value_buf *vb) +{ + struct buffer_head *bh = loc->xl_storage; + + if (loc->xl_size == (bh->b_size - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_header))) + vb->vb_access = ocfs2_journal_access_xb; + else + vb->vb_access = ocfs2_journal_access_di; + vb->vb_bh = bh; +} + /* * Operations for xattrs stored in blocks. This includes inline inode * storage and unindexed ocfs2_xattr_blocks. @@ -1724,6 +1762,7 @@ static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = { .xlo_wipe_namevalue = ocfs2_xa_block_wipe_namevalue, .xlo_add_entry = ocfs2_xa_block_add_entry, .xlo_add_namevalue = ocfs2_xa_block_add_namevalue, + .xlo_fill_value_buf = ocfs2_xa_block_fill_value_buf, }; static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc, @@ -1869,6 +1908,25 @@ static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size) } +static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_value_buf *vb) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + struct super_block *sb = bucket->bu_inode->i_sb; + int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); + int size = namevalue_size_xe(loc->xl_entry); + int block_offset = nameval_offset >> sb->s_blocksize_bits; + + /* Values are not allowed to straddle block boundaries */ + BUG_ON(block_offset != + ((nameval_offset + size - 1) >> sb->s_blocksize_bits)); + /* We expect the bucket to be filled in */ + BUG_ON(!bucket->bu_bhs[block_offset]); + + vb->vb_access = ocfs2_journal_access; + vb->vb_bh = bucket->bu_bhs[block_offset]; +} + /* Operations for xattrs stored in buckets. */ static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = { .xlo_offset_pointer = ocfs2_xa_bucket_offset_pointer, @@ -1878,6 +1936,7 @@ static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = { .xlo_wipe_namevalue = ocfs2_xa_bucket_wipe_namevalue, .xlo_add_entry = ocfs2_xa_bucket_add_entry, .xlo_add_namevalue = ocfs2_xa_bucket_add_namevalue, + .xlo_fill_value_buf = ocfs2_xa_bucket_fill_value_buf, }; static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc) -- cgit v1.2.3 From cf2bc809403ae48a4a2bb5cc551d2ec35f2e4a47 Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Tue, 18 Aug 2009 13:52:38 -0700 Subject: ocfs2: Teach ocfs2_xa_loc how to do its own journal work We're going to want to make sure our buffers get accessed and dirtied correctly. So have the xa_loc do the work. This includes storing the inode on ocfs2_xa_loc. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/xattr.c | 115 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 86 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index e0d0fa23a19b..fbec11610223 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -141,6 +141,13 @@ struct ocfs2_xattr_search { /* Operations on struct ocfs2_xa_entry */ struct ocfs2_xa_loc; struct ocfs2_xa_loc_operations { + /* + * Journal functions + */ + int (*xlo_journal_access)(handle_t *handle, struct ocfs2_xa_loc *loc, + int type); + void (*xlo_journal_dirty)(handle_t *handle, struct ocfs2_xa_loc *loc); + /* * Return a pointer to the appropriate buffer in loc->xl_storage * at the given offset from loc->xl_header. @@ -186,6 +193,9 @@ struct ocfs2_xa_loc_operations { * tracking the on-disk structure. */ struct ocfs2_xa_loc { + /* This xattr belongs to this inode */ + struct inode *xl_inode; + /* The ocfs2_xattr_header inside the on-disk storage. Not NULL. */ struct ocfs2_xattr_header *xl_header; @@ -1540,6 +1550,17 @@ static int ocfs2_xa_check_space_helper(int needed_space, int free_start, return 0; } +static int ocfs2_xa_journal_access(handle_t *handle, struct ocfs2_xa_loc *loc, + int type) +{ + return loc->xl_ops->xlo_journal_access(handle, loc, type); +} + +static void ocfs2_xa_journal_dirty(handle_t *handle, struct ocfs2_xa_loc *loc) +{ + loc->xl_ops->xlo_journal_dirty(handle, loc); +} + /* Give a pointer into the storage for the given offset */ static void *ocfs2_xa_offset_pointer(struct ocfs2_xa_loc *loc, int offset) { @@ -1629,6 +1650,29 @@ static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc, name_size); } +static int ocfs2_xa_block_journal_access(handle_t *handle, + struct ocfs2_xa_loc *loc, int type) +{ + struct buffer_head *bh = loc->xl_storage; + ocfs2_journal_access_func access; + + if (loc->xl_size == (bh->b_size - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_header))) + access = ocfs2_journal_access_xb; + else + access = ocfs2_journal_access_di; + return access(handle, INODE_CACHE(loc->xl_inode), bh, type); +} + +static void ocfs2_xa_block_journal_dirty(handle_t *handle, + struct ocfs2_xa_loc *loc) +{ + struct buffer_head *bh = loc->xl_storage; + + ocfs2_journal_dirty(handle, bh); +} + static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc, int offset) { @@ -1755,6 +1799,8 @@ static void ocfs2_xa_block_fill_value_buf(struct ocfs2_xa_loc *loc, * storage and unindexed ocfs2_xattr_blocks. */ static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = { + .xlo_journal_access = ocfs2_xa_block_journal_access, + .xlo_journal_dirty = ocfs2_xa_block_journal_dirty, .xlo_offset_pointer = ocfs2_xa_block_offset_pointer, .xlo_check_space = ocfs2_xa_block_check_space, .xlo_can_reuse = ocfs2_xa_block_can_reuse, @@ -1765,6 +1811,22 @@ static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = { .xlo_fill_value_buf = ocfs2_xa_block_fill_value_buf, }; +static int ocfs2_xa_bucket_journal_access(handle_t *handle, + struct ocfs2_xa_loc *loc, int type) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + + return ocfs2_xattr_bucket_journal_access(handle, bucket, type); +} + +static void ocfs2_xa_bucket_journal_dirty(handle_t *handle, + struct ocfs2_xa_loc *loc) +{ + struct ocfs2_xattr_bucket *bucket = loc->xl_storage; + + ocfs2_xattr_bucket_journal_dirty(handle, bucket); +} + static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc, int offset) { @@ -1772,8 +1834,8 @@ static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc, int block, block_offset; /* The header is at the front of the bucket */ - block = offset >> bucket->bu_inode->i_sb->s_blocksize_bits; - block_offset = offset % bucket->bu_inode->i_sb->s_blocksize; + block = offset >> loc->xl_inode->i_sb->s_blocksize_bits; + block_offset = offset % loc->xl_inode->i_sb->s_blocksize; return bucket_block(bucket, block) + block_offset; } @@ -1813,8 +1875,7 @@ static int ocfs2_xa_bucket_check_space(struct ocfs2_xa_loc *loc, int free_start = ocfs2_xa_get_free_start(loc); int needed_space = ocfs2_xi_entry_usage(xi); int size = namevalue_size_xi(xi); - struct ocfs2_xattr_bucket *bucket = loc->xl_storage; - struct super_block *sb = bucket->bu_inode->i_sb; + struct super_block *sb = loc->xl_inode->i_sb; /* * Bucket storage does not reclaim name+value pairs it cannot @@ -1896,8 +1957,7 @@ static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size) { int free_start = ocfs2_xa_get_free_start(loc); struct ocfs2_xattr_header *xh = loc->xl_header; - struct ocfs2_xattr_bucket *bucket = loc->xl_storage; - struct super_block *sb = bucket->bu_inode->i_sb; + struct super_block *sb = loc->xl_inode->i_sb; int nameval_offset; free_start = ocfs2_bucket_align_free_start(sb, free_start, size); @@ -1912,7 +1972,7 @@ static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc, struct ocfs2_xattr_value_buf *vb) { struct ocfs2_xattr_bucket *bucket = loc->xl_storage; - struct super_block *sb = bucket->bu_inode->i_sb; + struct super_block *sb = loc->xl_inode->i_sb; int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); int size = namevalue_size_xe(loc->xl_entry); int block_offset = nameval_offset >> sb->s_blocksize_bits; @@ -1929,6 +1989,8 @@ static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc, /* Operations for xattrs stored in buckets. */ static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = { + .xlo_journal_access = ocfs2_xa_bucket_journal_access, + .xlo_journal_dirty = ocfs2_xa_bucket_journal_dirty, .xlo_offset_pointer = ocfs2_xa_bucket_offset_pointer, .xlo_check_space = ocfs2_xa_bucket_check_space, .xlo_can_reuse = ocfs2_xa_bucket_can_reuse, @@ -2049,6 +2111,7 @@ static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc, { struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + loc->xl_inode = inode; loc->xl_ops = &ocfs2_xa_block_loc_ops; loc->xl_storage = bh; loc->xl_entry = entry; @@ -2065,6 +2128,7 @@ static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc, } static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc, + struct inode *inode, struct buffer_head *bh, struct ocfs2_xattr_entry *entry) { @@ -2073,6 +2137,7 @@ static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc, BUG_ON(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED); + loc->xl_inode = inode; loc->xl_ops = &ocfs2_xa_block_loc_ops; loc->xl_storage = bh; loc->xl_header = &(xb->xb_attrs.xb_header); @@ -2085,6 +2150,7 @@ static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc, struct ocfs2_xattr_bucket *bucket, struct ocfs2_xattr_entry *entry) { + loc->xl_inode = bucket->bu_inode; loc->xl_ops = &ocfs2_xa_bucket_loc_ops; loc->xl_storage = bucket; loc->xl_header = bucket_xh(bucket); @@ -2226,21 +2292,18 @@ static int ocfs2_xattr_set_entry(struct inode *inode, goto out; } - if (!(flag & OCFS2_INLINE_XATTR_FL)) { - ret = vb.vb_access(handle, INODE_CACHE(inode), vb.vb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } - } - if (xs->xattr_bh == xs->inode_bh) ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh, xs->not_found ? NULL : xs->here); else - ocfs2_init_xattr_block_xa_loc(&loc, xs->xattr_bh, + ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh, xs->not_found ? NULL : xs->here); + ret = ocfs2_xa_journal_access(handle, &loc, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } /* * Prepare our entry and insert the inline value. This will @@ -2258,13 +2321,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, ocfs2_xa_store_inline_value(&loc, xi); xs->here = loc.xl_entry; - if (!(flag & OCFS2_INLINE_XATTR_FL)) { - ret = ocfs2_journal_dirty(handle, xs->xattr_bh); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - } + ocfs2_xa_journal_dirty(handle, &loc); if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) && (flag & OCFS2_INLINE_XATTR_FL)) { @@ -5325,15 +5382,15 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, } } - ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket, - OCFS2_JOURNAL_ACCESS_WRITE); + ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket, + xs->not_found ? NULL : xs->here); + ret = ocfs2_xa_journal_access(handle, &loc, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out; } - ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket, - xs->not_found ? NULL : xs->here); ret = ocfs2_xa_prepare_entry(&loc, xi, name_hash); if (ret) { if (ret != -ENOSPC) @@ -5345,7 +5402,7 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, ocfs2_xa_store_inline_value(&loc, xi); xs->here = loc.xl_entry; - ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket); + ocfs2_xa_journal_dirty(handle, &loc); out: return ret; -- cgit v1.2.3 From 73857ee0b548017f9632a0d0e6fe2dabbdc11d31 Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Tue, 18 Aug 2009 20:26:41 -0700 Subject: ocfs2: Allocation in ocfs2_xa_prepare_entry(), values in ocfs2_xa_store_value() ocfs2_xa_prepare_entry() gets all the logic to add, remove, or modify external value trees. Now, when it exits, the entry is ready to receive a value of any size. ocfs2_xa_remove() is added to handle the complete removal of an entry. It truncates the external value tree before calling ocfs2_xa_remove_entry(). ocfs2_xa_store_inline_value() becomes ocfs2_xa_store_value(). It can store any value. ocfs2_xattr_set_entry() loses all the allocation logic and just uses these functions. ocfs2_xattr_set_value_outside() disappears. ocfs2_xattr_set_in_bucket() uses these functions and makes ocfs2_xattr_set_entry_in_bucket() obsolete. That goes away, as does ocfs2_xattr_bucket_set_value_outside() and ocfs2_xattr_bucket_value_truncate(). Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/xattr.c | 661 ++++++++++++++++--------------------------------------- 1 file changed, 186 insertions(+), 475 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index fbec11610223..550a3e86c971 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -573,24 +573,6 @@ static u32 ocfs2_xattr_name_hash(struct inode *inode, return hash; } -/* - * ocfs2_xattr_hash_entry() - * - * Compute the hash of an extended attribute. - */ -static void ocfs2_xattr_hash_entry(struct inode *inode, - struct ocfs2_xattr_header *header, - struct ocfs2_xattr_entry *entry) -{ - u32 hash = 0; - char *name = (char *)header + le16_to_cpu(entry->xe_name_offset); - - hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len); - entry->xe_name_hash = cpu_to_le32(hash); - - return; -} - static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len) { return namevalue_size(name_len, value_len) + @@ -1423,113 +1405,6 @@ out: return ret; } -static int ocfs2_xattr_cleanup(struct inode *inode, - handle_t *handle, - struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs, - struct ocfs2_xattr_value_buf *vb, - size_t offs) -{ - int ret = 0; - void *val = xs->base + offs; - size_t size = namevalue_size_xi(xi); - - ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } - /* Decrease xattr count */ - le16_add_cpu(&xs->header->xh_count, -1); - /* Remove the xattr entry and tree root which has already be set*/ - memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry)); - memset(val, 0, size); - - ret = ocfs2_journal_dirty(handle, vb->vb_bh); - if (ret < 0) - mlog_errno(ret); -out: - return ret; -} - -static int ocfs2_xattr_update_entry(struct inode *inode, - handle_t *handle, - struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs, - struct ocfs2_xattr_value_buf *vb, - size_t offs) -{ - int ret; - - ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } - - xs->here->xe_name_offset = cpu_to_le16(offs); - xs->here->xe_value_size = cpu_to_le64(xi->xi_value_len); - if (xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) - ocfs2_xattr_set_local(xs->here, 1); - else - ocfs2_xattr_set_local(xs->here, 0); - ocfs2_xattr_hash_entry(inode, xs->header, xs->here); - - ret = ocfs2_journal_dirty(handle, vb->vb_bh); - if (ret < 0) - mlog_errno(ret); -out: - return ret; -} - -/* - * ocfs2_xattr_set_value_outside() - * - * Set large size value in B tree. - */ -static int ocfs2_xattr_set_value_outside(struct inode *inode, - struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs, - struct ocfs2_xattr_set_ctxt *ctxt, - struct ocfs2_xattr_value_buf *vb, - size_t offs) -{ - void *val = xs->base + offs; - struct ocfs2_xattr_value_root *xv = NULL; - size_t size = namevalue_size_xi(xi); - int ret = 0; - - memset(val, 0, size); - memcpy(val, xi->xi_name, xi->xi_name_len); - xv = (struct ocfs2_xattr_value_root *) - (val + OCFS2_XATTR_SIZE(xi->xi_name_len)); - xv->xr_clusters = 0; - xv->xr_last_eb_blk = 0; - xv->xr_list.l_tree_depth = 0; - xv->xr_list.l_count = cpu_to_le16(1); - xv->xr_list.l_next_free_rec = 0; - vb->vb_xv = xv; - - ret = ocfs2_xattr_value_truncate(inode, vb, xi->xi_value_len, ctxt); - if (ret < 0) { - mlog_errno(ret); - return ret; - } - ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs); - if (ret < 0) { - mlog_errno(ret); - return ret; - } - ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb, - xi->xi_value, xi->xi_value_len); - if (ret < 0) - mlog_errno(ret); - - return ret; -} - static int ocfs2_xa_check_space_helper(int needed_space, int free_start, int num_entries) { @@ -1640,6 +1515,7 @@ static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc, int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len); /* Value bufs are for value trees */ + BUG_ON(ocfs2_xattr_is_local(loc->xl_entry)); BUG_ON(namevalue_size_xe(loc->xl_entry) != (name_size + OCFS2_XATTR_ROOT_SIZE)); @@ -2001,6 +1877,33 @@ static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = { .xlo_fill_value_buf = ocfs2_xa_bucket_fill_value_buf, }; +static int ocfs2_xa_value_truncate(struct ocfs2_xa_loc *loc, u64 bytes, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int trunc_rc, access_rc; + struct ocfs2_xattr_value_buf vb; + + ocfs2_xa_fill_value_buf(loc, &vb); + trunc_rc = ocfs2_xattr_value_truncate(loc->xl_inode, &vb, bytes, + ctxt); + + /* + * The caller of ocfs2_xa_value_truncate() has already called + * ocfs2_xa_journal_access on the loc. However, The truncate code + * calls ocfs2_extend_trans(). This may commit the previous + * transaction and open a new one. If this is a bucket, truncate + * could leave only vb->vb_bh set up for journaling. Meanwhile, + * the caller is expecting to dirty the entire bucket. So we must + * reset the journal work. We do this even if truncate has failed, + * as it could have failed after committing the extend. + */ + access_rc = ocfs2_xa_journal_access(ctxt->handle, loc, + OCFS2_JOURNAL_ACCESS_WRITE); + + /* Errors in truncate take precedence */ + return trunc_rc ? trunc_rc : access_rc; +} + static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc) { int index, count; @@ -2028,6 +1931,88 @@ static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc) } } +static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int rc = 0; + + if (!ocfs2_xattr_is_local(loc->xl_entry)) { + rc = ocfs2_xa_value_truncate(loc, 0, ctxt); + if (rc) { + mlog_errno(rc); + goto out; + } + } + + ocfs2_xa_remove_entry(loc); + +out: + return rc; +} + +static void ocfs2_xa_install_value_root(struct ocfs2_xa_loc *loc) +{ + int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len); + char *nameval_buf; + + nameval_buf = ocfs2_xa_offset_pointer(loc, + le16_to_cpu(loc->xl_entry->xe_name_offset)); + memcpy(nameval_buf + name_size, &def_xv, OCFS2_XATTR_ROOT_SIZE); +} + +/* + * Take an existing entry and make it ready for the new value. This + * won't allocate space, but it may free space. It should be ready for + * ocfs2_xa_prepare_entry() to finish the work. + */ +static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int rc = 0; + int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len); + char *nameval_buf; + int xe_local = ocfs2_xattr_is_local(loc->xl_entry); + int xi_local = xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE; + + BUG_ON(OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len) != + name_size); + + nameval_buf = ocfs2_xa_offset_pointer(loc, + le16_to_cpu(loc->xl_entry->xe_name_offset)); + if (xe_local) { + memset(nameval_buf + name_size, 0, + namevalue_size_xe(loc->xl_entry) - name_size); + if (!xi_local) + ocfs2_xa_install_value_root(loc); + } else { + if (xi_local) { + rc = ocfs2_xa_value_truncate(loc, 0, ctxt); + if (rc < 0) { + mlog_errno(rc); + goto out; + } + memset(nameval_buf + name_size, 0, + namevalue_size_xe(loc->xl_entry) - + name_size); + } else if (le64_to_cpu(loc->xl_entry->xe_value_size) > + xi->xi_value_len) { + rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, + ctxt); + if (rc < 0) { + mlog_errno(rc); + goto out; + } + } + } + + loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len); + ocfs2_xattr_set_local(loc->xl_entry, xi_local); + +out: + return rc; +} + /* * Prepares loc->xl_entry to receive the new xattr. This includes * properly setting up the name+value pair region. If loc->xl_entry @@ -2040,14 +2025,13 @@ static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc) */ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc, struct ocfs2_xattr_info *xi, - u32 name_hash) + u32 name_hash, + struct ocfs2_xattr_set_ctxt *ctxt) { int rc = 0; - int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len); - char *nameval_buf; if (!xi->xi_value) { - ocfs2_xa_remove_entry(loc); + rc = ocfs2_xa_remove(loc, ctxt); goto out; } @@ -2057,15 +2041,19 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc, if (loc->xl_entry) { if (ocfs2_xa_can_reuse_entry(loc, xi)) { - nameval_buf = ocfs2_xa_offset_pointer(loc, - le16_to_cpu(loc->xl_entry->xe_name_offset)); - memset(nameval_buf + name_size, 0, - namevalue_size_xe(loc->xl_entry) - name_size); - loc->xl_entry->xe_value_size = - cpu_to_le64(xi->xi_value_len); - goto out; + rc = ocfs2_xa_reuse_entry(loc, xi, ctxt); + if (rc) + goto out; + goto alloc_value; } + if (!ocfs2_xattr_is_local(loc->xl_entry)) { + rc = ocfs2_xa_value_truncate(loc, 0, ctxt); + if (rc) { + mlog_errno(rc); + goto out; + } + } ocfs2_xa_wipe_namevalue(loc); } else ocfs2_xa_add_entry(loc, name_hash); @@ -2075,33 +2063,50 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc, * name+value pair back from the end. */ ocfs2_xa_add_namevalue(loc, xi); + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) + ocfs2_xa_install_value_root(loc); + +alloc_value: + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt); + if (rc < 0) + mlog_errno(rc); + } out: return rc; } /* - * Store the value portion of the name+value pair. This is either an - * inline value or the tree root of an external value. + * Store the value portion of the name+value pair. This will skip + * values that are stored externally. Their tree roots were set up + * by ocfs2_xa_prepare_entry(). */ -static void ocfs2_xa_store_inline_value(struct ocfs2_xa_loc *loc, - struct ocfs2_xattr_info *xi) +static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_set_ctxt *ctxt) { + int rc = 0; int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset); int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len); - int inline_value_size = namevalue_size_xi(xi) - name_size; - const void *value = xi->xi_value; char *nameval_buf; + struct ocfs2_xattr_value_buf vb; if (!xi->xi_value) - return; + goto out; - if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { - value = &def_xv; - inline_value_size = OCFS2_XATTR_ROOT_SIZE; - } nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset); - memcpy(nameval_buf + name_size, value, inline_value_size); + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + ocfs2_xa_fill_value_buf(loc, &vb); + rc = __ocfs2_xattr_set_value_outside(loc->xl_inode, + ctxt->handle, &vb, + xi->xi_value, + xi->xi_value_len); + } else + memcpy(nameval_buf + name_size, xi->xi_value, xi->xi_value_len); + +out: + return rc; } static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc, @@ -2174,117 +2179,19 @@ static int ocfs2_xattr_set_entry(struct inode *inode, struct ocfs2_xattr_set_ctxt *ctxt, int flag) { - struct ocfs2_xattr_entry *last; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; - size_t min_offs = xs->end - xs->base; - size_t size_l = 0; handle_t *handle = ctxt->handle; - int free, i, ret; + int ret; u32 name_hash = ocfs2_xattr_name_hash(inode, xi->xi_name, xi->xi_name_len); struct ocfs2_xa_loc loc; - struct ocfs2_xattr_value_buf vb = { - .vb_bh = xs->xattr_bh, - .vb_access = ocfs2_journal_access_di, - }; - if (!(flag & OCFS2_INLINE_XATTR_FL)) { + if (!(flag & OCFS2_INLINE_XATTR_FL)) BUG_ON(xs->xattr_bh == xs->inode_bh); - vb.vb_access = ocfs2_journal_access_xb; - } else + else BUG_ON(xs->xattr_bh != xs->inode_bh); - /* Compute min_offs, last and free space. */ - last = xs->header->xh_entries; - - for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) { - size_t offs = le16_to_cpu(last->xe_name_offset); - if (offs < min_offs) - min_offs = offs; - last += 1; - } - - free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP; - if (free < 0) - return -EIO; - - if (!xs->not_found) - free += ocfs2_xe_entry_usage(xs->here); - - /* Check free space in inode or block */ - if (xi->xi_value && (free < ocfs2_xi_entry_usage(xi))) { - ret = -ENOSPC; - goto out; - } - - if (!xs->not_found) { - /* For existing extended attribute */ - size_t size = namevalue_size_xe(xs->here); - size_t offs = le16_to_cpu(xs->here->xe_name_offset); - void *val = xs->base + offs; - - if (ocfs2_xattr_is_local(xs->here) && size == size_l) { - /* Replace existing local xattr with tree root */ - ret = ocfs2_xattr_set_value_outside(inode, xi, xs, - ctxt, &vb, offs); - if (ret < 0) - mlog_errno(ret); - goto out; - } else if (!ocfs2_xattr_is_local(xs->here)) { - /* For existing xattr which has value outside */ - vb.vb_xv = (struct ocfs2_xattr_value_root *) - (val + OCFS2_XATTR_SIZE(xi->xi_name_len)); - - if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { - /* - * If new value need set outside also, - * first truncate old value to new value, - * then set new value with set_value_outside(). - */ - ret = ocfs2_xattr_value_truncate(inode, - &vb, - xi->xi_value_len, - ctxt); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - ret = ocfs2_xattr_update_entry(inode, - handle, - xi, - xs, - &vb, - offs); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - ret = __ocfs2_xattr_set_value_outside(inode, - handle, - &vb, - xi->xi_value, - xi->xi_value_len); - if (ret < 0) - mlog_errno(ret); - goto out; - } else { - /* - * If new value need set in local, - * just trucate old value to zero. - */ - ret = ocfs2_xattr_value_truncate(inode, - &vb, - 0, - ctxt); - if (ret < 0) - mlog_errno(ret); - } - } - } - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { @@ -2305,22 +2212,20 @@ static int ocfs2_xattr_set_entry(struct inode *inode, goto out; } - /* - * Prepare our entry and insert the inline value. This will - * be a value tree root for values that are larger than - * OCFS2_XATTR_INLINE_SIZE. - */ - ret = ocfs2_xa_prepare_entry(&loc, xi, name_hash); + ret = ocfs2_xa_prepare_entry(&loc, xi, name_hash, ctxt); if (ret) { if (ret != -ENOSPC) mlog_errno(ret); goto out; } - /* XXX For now, until we make ocfs2_xa_prepare_entry() primary */ - BUG_ON(ret == -ENOSPC); - ocfs2_xa_store_inline_value(&loc, xi); xs->here = loc.xl_entry; + ret = ocfs2_xa_store_value(&loc, xi, ctxt); + if (ret) { + mlog_errno(ret); + goto out; + } + ocfs2_xa_journal_dirty(handle, &loc); if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) && @@ -2352,28 +2257,6 @@ static int ocfs2_xattr_set_entry(struct inode *inode, if (ret < 0) mlog_errno(ret); - if (!ret && xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { - /* - * Set value outside in B tree. - * This is the second step for value size > INLINE_SIZE. - */ - size_t offs = le16_to_cpu(xs->here->xe_name_offset); - ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt, - &vb, offs); - if (ret < 0) { - int ret2; - - mlog_errno(ret); - /* - * If set value outside failed, we have to clean - * the junk tree root we have already set in local. - */ - ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle, - xi, xs, &vb, offs); - if (ret2 < 0) - mlog_errno(ret2); - } - } out: return ret; } @@ -5353,61 +5236,6 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode, return bucket_block(bucket, block_off) + offs; } -/* - * Set the xattr entry in the specified bucket. - * The bucket is indicated by xs->bucket and it should have the enough - * space for the xattr insertion. - */ -static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, - handle_t *handle, - struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs, - u32 name_hash) -{ - int ret; - u64 blkno; - struct ocfs2_xa_loc loc; - - mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n", - (unsigned long)xi->xi_value_len, xi->xi_name_index, - (unsigned long long)bucket_blkno(xs->bucket)); - - if (!xs->bucket->bu_bhs[1]) { - blkno = bucket_blkno(xs->bucket); - ocfs2_xattr_bucket_relse(xs->bucket); - ret = ocfs2_read_xattr_bucket(xs->bucket, blkno); - if (ret) { - mlog_errno(ret); - goto out; - } - } - - ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket, - xs->not_found ? NULL : xs->here); - ret = ocfs2_xa_journal_access(handle, &loc, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - ret = ocfs2_xa_prepare_entry(&loc, xi, name_hash); - if (ret) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto out; - } - /* XXX For now, until we make ocfs2_xa_prepare_entry() primary */ - BUG_ON(ret == -ENOSPC); - ocfs2_xa_store_inline_value(&loc, xi); - xs->here = loc.xl_entry; - - ocfs2_xa_journal_dirty(handle, &loc); - -out: - return ret; -} - /* * Truncate the specified xe_off entry in xattr bucket. * bucket is indicated by header_bh and len is the new length. @@ -5478,66 +5306,6 @@ out: return ret; } -static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode, - struct ocfs2_xattr_search *xs, - int len, - struct ocfs2_xattr_set_ctxt *ctxt) -{ - int ret, offset; - struct ocfs2_xattr_entry *xe = xs->here; - struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base; - - BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe)); - - offset = xe - xh->xh_entries; - ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket, - offset, len, ctxt); - if (ret) - mlog_errno(ret); - - return ret; -} - -static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, - handle_t *handle, - struct ocfs2_xattr_search *xs, - char *val, - int value_len) -{ - int ret, offset, block_off; - struct ocfs2_xattr_value_root *xv; - struct ocfs2_xattr_entry *xe = xs->here; - struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket); - void *base; - struct ocfs2_xattr_value_buf vb = { - .vb_access = ocfs2_journal_access, - }; - - BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); - - ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh, - xe - xh->xh_entries, - &block_off, - &offset); - if (ret) { - mlog_errno(ret); - goto out; - } - - base = bucket_block(xs->bucket, block_off); - xv = (struct ocfs2_xattr_value_root *)(base + offset + - OCFS2_XATTR_SIZE(xe->xe_name_len)); - - vb.vb_xv = xv; - vb.vb_bh = xs->bucket->bu_bhs[block_off]; - ret = __ocfs2_xattr_set_value_outside(inode, handle, - &vb, val, value_len); - if (ret) - mlog_errno(ret); -out: - return ret; -} - static int ocfs2_rm_xattr_cluster(struct inode *inode, struct buffer_head *root_bh, u64 blkno, @@ -5636,41 +5404,8 @@ out: return ret; } -static void ocfs2_xattr_bucket_remove_xs(struct inode *inode, - handle_t *handle, - struct ocfs2_xattr_search *xs) -{ - struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket); - struct ocfs2_xattr_entry *last = &xh->xh_entries[ - le16_to_cpu(xh->xh_count) - 1]; - int ret = 0; - - ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - return; - } - - /* Remove the old entry. */ - memmove(xs->here, xs->here + 1, - (void *)last - (void *)xs->here); - memset(last, 0, sizeof(struct ocfs2_xattr_entry)); - le16_add_cpu(&xh->xh_count, -1); - - ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket); -} - /* * Set the xattr name/value in the bucket specified in xs. - * - * As the new value in xi may be stored in the bucket or in an outside cluster, - * we divide the whole process into 3 steps: - * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket) - * 2. truncate of the outside cluster(ocfs2_xattr_bucket_value_truncate_xs) - * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside) - * 4. If the clusters for the new outside value can't be allocated, we need - * to free the xattr we allocated in set. */ static int ocfs2_xattr_set_in_bucket(struct inode *inode, struct ocfs2_xattr_info *xi, @@ -5678,70 +5413,46 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, struct ocfs2_xattr_set_ctxt *ctxt) { int ret; - size_t value_len; - char *val = (char *)xi->xi_value; - struct ocfs2_xattr_entry *xe = xs->here; + u64 blkno; + struct ocfs2_xa_loc loc; u32 name_hash = ocfs2_xattr_name_hash(inode, xi->xi_name, xi->xi_name_len); - value_len = xi->xi_value_len; - if (!xs->not_found && !ocfs2_xattr_is_local(xe)) { - /* - * We need to truncate the xattr storage first. - * - * If both the old and new value are stored to - * outside block, we only need to truncate - * the storage and then set the value outside. - * - * If the new value should be stored within block, - * we should free all the outside block first and - * the modification to the xattr block will be done - * by following steps. - */ - if (xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) - value_len = 0; - - ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, - value_len, - ctxt); - if (ret) + if (!xs->bucket->bu_bhs[1]) { + blkno = bucket_blkno(xs->bucket); + ocfs2_xattr_bucket_relse(xs->bucket); + ret = ocfs2_read_xattr_bucket(xs->bucket, blkno); + if (ret) { + mlog_errno(ret); goto out; - - if (value_len) - goto set_value_outside; + } } - /* So we have to handle the inside block change now. */ - ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs, - name_hash); - if (ret) { + ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket, + xs->not_found ? NULL : xs->here); + ret = ocfs2_xa_journal_access(ctxt->handle, &loc, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { mlog_errno(ret); goto out; } - if (xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) + ret = ocfs2_xa_prepare_entry(&loc, xi, name_hash, ctxt); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); goto out; + } + xs->here = loc.xl_entry; - /* allocate the space now for the outside block storage. */ - ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, - value_len, ctxt); + ret = ocfs2_xa_store_value(&loc, xi, ctxt); if (ret) { mlog_errno(ret); - - if (xs->not_found) { - /* - * We can't allocate enough clusters for outside - * storage and we have allocated xattr already, - * so need to remove it. - */ - ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs); - } goto out; } -set_value_outside: - ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle, - xs, val, value_len); + ocfs2_xa_journal_dirty(ctxt->handle, &loc); + out: return ret; } -- cgit v1.2.3 From bca5e9bd1eb2a9422a2ff29e822a956310653754 Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Tue, 18 Aug 2009 20:40:14 -0700 Subject: ocfs2: Gell into ocfs2_xa_set() ocfs2_xa_set() wraps the ocfs2_xa_prepare_entry()/ocfs2_xa_store_value() logic. Both callers can now use the same routine. ocfs2_xa_remove() moves directly into ocfs2_xa_set(). Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/xattr.c | 88 +++++++++++++++++++++++++++----------------------------- 1 file changed, 42 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 550a3e86c971..98c18fbfc289 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2017,8 +2017,6 @@ out: * Prepares loc->xl_entry to receive the new xattr. This includes * properly setting up the name+value pair region. If loc->xl_entry * already exists, it will take care of modifying it appropriately. - * This also includes deleting entries, but don't call this to remove - * a non-existant entry. That's just a bug. * * Note that this modifies the data. You did journal_access already, * right? @@ -2030,11 +2028,6 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc, { int rc = 0; - if (!xi->xi_value) { - rc = ocfs2_xa_remove(loc, ctxt); - goto out; - } - rc = ocfs2_xa_check_space(loc, xi); if (rc) goto out; @@ -2092,9 +2085,6 @@ static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc, char *nameval_buf; struct ocfs2_xattr_value_buf vb; - if (!xi->xi_value) - goto out; - nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset); if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { ocfs2_xa_fill_value_buf(loc, &vb); @@ -2105,10 +2095,49 @@ static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc, } else memcpy(nameval_buf + name_size, xi->xi_value, xi->xi_value_len); -out: return rc; } +static int ocfs2_xa_set(struct ocfs2_xa_loc *loc, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + u32 name_hash = ocfs2_xattr_name_hash(loc->xl_inode, xi->xi_name, + xi->xi_name_len); + + ret = ocfs2_xa_journal_access(ctxt->handle, loc, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Don't worry, we are never called with !xi_value and !xl_entry */ + if (!xi->xi_value) { + ret = ocfs2_xa_remove(loc, ctxt); + goto out; + } + + ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xa_store_value(loc, xi, ctxt); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_xa_journal_dirty(ctxt->handle, loc); + +out: + return ret; +} + static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc, struct inode *inode, struct buffer_head *bh, @@ -2183,8 +2212,6 @@ static int ocfs2_xattr_set_entry(struct inode *inode, struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; handle_t *handle = ctxt->handle; int ret; - u32 name_hash = ocfs2_xattr_name_hash(inode, xi->xi_name, - xi->xi_name_len); struct ocfs2_xa_loc loc; if (!(flag & OCFS2_INLINE_XATTR_FL)) @@ -2205,14 +2232,8 @@ static int ocfs2_xattr_set_entry(struct inode *inode, else ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh, xs->not_found ? NULL : xs->here); - ret = ocfs2_xa_journal_access(handle, &loc, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } - ret = ocfs2_xa_prepare_entry(&loc, xi, name_hash, ctxt); + ret = ocfs2_xa_set(&loc, xi, ctxt); if (ret) { if (ret != -ENOSPC) mlog_errno(ret); @@ -2220,14 +2241,6 @@ static int ocfs2_xattr_set_entry(struct inode *inode, } xs->here = loc.xl_entry; - ret = ocfs2_xa_store_value(&loc, xi, ctxt); - if (ret) { - mlog_errno(ret); - goto out; - } - - ocfs2_xa_journal_dirty(handle, &loc); - if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) && (flag & OCFS2_INLINE_XATTR_FL)) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -5415,8 +5428,6 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, int ret; u64 blkno; struct ocfs2_xa_loc loc; - u32 name_hash = ocfs2_xattr_name_hash(inode, xi->xi_name, - xi->xi_name_len); if (!xs->bucket->bu_bhs[1]) { blkno = bucket_blkno(xs->bucket); @@ -5430,14 +5441,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket, xs->not_found ? NULL : xs->here); - ret = ocfs2_xa_journal_access(ctxt->handle, &loc, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - ret = ocfs2_xa_prepare_entry(&loc, xi, name_hash, ctxt); + ret = ocfs2_xa_set(&loc, xi, ctxt); if (ret) { if (ret != -ENOSPC) mlog_errno(ret); @@ -5445,14 +5449,6 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, } xs->here = loc.xl_entry; - ret = ocfs2_xa_store_value(&loc, xi, ctxt); - if (ret) { - mlog_errno(ret); - goto out; - } - - ocfs2_xa_journal_dirty(ctxt->handle, &loc); - out: return ret; } -- cgit v1.2.3 From c5d95df5f78312c879f3058059c98a08821897a5 Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Tue, 18 Aug 2009 21:03:24 -0700 Subject: ocfs2: Let ocfs2_xa_prepare_entry() do space checks. ocfs2_xattr_set_in_bucket() doesn't need to do its own hacky space checking. Let's let ocfs2_xa_prepare_entry() (via ocfs2_xa_set()) do the more accurate work. Whenever it doesn't have space, ocfs2_xattr_set_in_bucket() can try to get more space. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/xattr.c | 270 +++++++++++++++++++------------------------------------ 1 file changed, 93 insertions(+), 177 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 98c18fbfc289..e7630a77a6c8 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -322,14 +322,6 @@ static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb) return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits); } -static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb) -{ - u16 len = sb->s_blocksize - - offsetof(struct ocfs2_xattr_header, xh_entries); - - return len / sizeof(struct ocfs2_xattr_entry); -} - #define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr) #define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data) #define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0)) @@ -5417,42 +5409,6 @@ out: return ret; } -/* - * Set the xattr name/value in the bucket specified in xs. - */ -static int ocfs2_xattr_set_in_bucket(struct inode *inode, - struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs, - struct ocfs2_xattr_set_ctxt *ctxt) -{ - int ret; - u64 blkno; - struct ocfs2_xa_loc loc; - - if (!xs->bucket->bu_bhs[1]) { - blkno = bucket_blkno(xs->bucket); - ocfs2_xattr_bucket_relse(xs->bucket); - ret = ocfs2_read_xattr_bucket(xs->bucket, blkno); - if (ret) { - mlog_errno(ret); - goto out; - } - } - - ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket, - xs->not_found ? NULL : xs->here); - ret = ocfs2_xa_set(&loc, xi, ctxt); - if (ret) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto out; - } - xs->here = loc.xl_entry; - -out: - return ret; -} - /* * check whether the xattr bucket is filled up with the same hash value. * If we want to insert the xattr with the same hash, return -ENOSPC. @@ -5481,156 +5437,116 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode, return 0; } -static int ocfs2_xattr_set_entry_index_block(struct inode *inode, - struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs, - struct ocfs2_xattr_set_ctxt *ctxt) +/* + * Try to set the entry in the current bucket. If we fail, the caller + * will handle getting us another bucket. + */ +static int ocfs2_xattr_set_entry_bucket(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) { - struct ocfs2_xattr_header *xh; - struct ocfs2_xattr_entry *xe; - u16 count, header_size, xh_free_start; - int free, max_free, need, old; - size_t value_size = 0; - size_t blocksize = inode->i_sb->s_blocksize; - int ret, allocation = 0; - - mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name); - -try_again: - xh = xs->header; - count = le16_to_cpu(xh->xh_count); - xh_free_start = le16_to_cpu(xh->xh_free_start); - header_size = sizeof(struct ocfs2_xattr_header) + - count * sizeof(struct ocfs2_xattr_entry); - max_free = OCFS2_XATTR_BUCKET_SIZE - header_size - - le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP; - - mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size " - "of %u which exceed block size\n", - (unsigned long long)bucket_blkno(xs->bucket), - header_size); + int ret; + struct ocfs2_xa_loc loc; - if (xi->xi_value && xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) - value_size = OCFS2_XATTR_ROOT_SIZE; - else if (xi->xi_value) - value_size = OCFS2_XATTR_SIZE(xi->xi_value_len); + mlog_entry("Set xattr %s in xattr bucket\n", xi->xi_name); - if (xs->not_found) - need = sizeof(struct ocfs2_xattr_entry) + - OCFS2_XATTR_SIZE(xi->xi_name_len) + value_size; - else { - need = value_size + OCFS2_XATTR_SIZE(xi->xi_name_len); + ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket, + xs->not_found ? NULL : xs->here); + ret = ocfs2_xa_set(&loc, xi, ctxt); + if (!ret) { + xs->here = loc.xl_entry; + goto out; + } + if (ret != -ENOSPC) { + mlog_errno(ret); + goto out; + } - /* - * We only replace the old value if the new length is smaller - * than the old one. Otherwise we will allocate new space in the - * bucket to store it. - */ - xe = xs->here; - if (ocfs2_xattr_is_local(xe)) - old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); - else - old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE); + /* Ok, we need space. Let's try defragmenting the bucket. */ + ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle, + xs->bucket); + if (ret) { + mlog_errno(ret); + goto out; + } - if (old >= value_size) - need = 0; + ret = ocfs2_xa_set(&loc, xi, ctxt); + if (!ret) { + xs->here = loc.xl_entry; + goto out; } + if (ret != -ENOSPC) + mlog_errno(ret); - free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP; - /* - * We need to make sure the new name/value pair - * can exist in the same block. - */ - if (xh_free_start % blocksize < need) - free -= xh_free_start % blocksize; - - mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, " - "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len =" - " %u\n", xs->not_found, - (unsigned long long)bucket_blkno(xs->bucket), - free, need, max_free, le16_to_cpu(xh->xh_free_start), - le16_to_cpu(xh->xh_name_value_len)); - - if (free < need || - (xs->not_found && - count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) { - if (need <= max_free && - count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { - /* - * We can create the space by defragment. Since only the - * name/value will be moved, the xe shouldn't be changed - * in xs. - */ - ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle, - xs->bucket); - if (ret) { - mlog_errno(ret); - goto out; - } - xh_free_start = le16_to_cpu(xh->xh_free_start); - free = xh_free_start - header_size - - OCFS2_XATTR_HEADER_GAP; - if (xh_free_start % blocksize < need) - free -= xh_free_start % blocksize; +out: + mlog_exit(ret); + return ret; +} - if (free >= need) - goto xattr_set; +static int ocfs2_xattr_set_entry_index_block(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; - mlog(0, "Can't get enough space for xattr insert by " - "defragment. Need %u bytes, but we have %d, so " - "allocate new bucket for it.\n", need, free); - } + mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name); - /* - * We have to add new buckets or clusters and one - * allocation should leave us enough space for insert. - */ - BUG_ON(allocation); + ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt); + if (!ret) + goto out; + if (ret != -ENOSPC) { + mlog_errno(ret); + goto out; + } - /* - * We do not allow for overlapping ranges between buckets. And - * the maximum number of collisions we will allow for then is - * one bucket's worth, so check it here whether we need to - * add a new bucket for the insert. - */ - ret = ocfs2_check_xattr_bucket_collision(inode, - xs->bucket, - xi->xi_name); - if (ret) { - mlog_errno(ret); - goto out; - } + /* Ack, need more space. Let's try to get another bucket! */ - ret = ocfs2_add_new_xattr_bucket(inode, - xs->xattr_bh, + /* + * We do not allow for overlapping ranges between buckets. And + * the maximum number of collisions we will allow for then is + * one bucket's worth, so check it here whether we need to + * add a new bucket for the insert. + */ + ret = ocfs2_check_xattr_bucket_collision(inode, xs->bucket, - ctxt); - if (ret) { - mlog_errno(ret); - goto out; - } + xi->xi_name); + if (ret) { + mlog_errno(ret); + goto out; + } - /* - * ocfs2_add_new_xattr_bucket() will have updated - * xs->bucket if it moved, but it will not have updated - * any of the other search fields. Thus, we drop it and - * re-search. Everything should be cached, so it'll be - * quick. - */ - ocfs2_xattr_bucket_relse(xs->bucket); - ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh, - xi->xi_name_index, - xi->xi_name, xs); - if (ret && ret != -ENODATA) - goto out; - xs->not_found = ret; - allocation = 1; - goto try_again; + ret = ocfs2_add_new_xattr_bucket(inode, + xs->xattr_bh, + xs->bucket, + ctxt); + if (ret) { + mlog_errno(ret); + goto out; } -xattr_set: - ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt); + /* + * ocfs2_add_new_xattr_bucket() will have updated + * xs->bucket if it moved, but it will not have updated + * any of the other search fields. Thus, we drop it and + * re-search. Everything should be cached, so it'll be + * quick. + */ + ocfs2_xattr_bucket_relse(xs->bucket); + ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh, + xi->xi_name_index, + xi->xi_name, xs); + if (ret && ret != -ENODATA) + goto out; + xs->not_found = ret; + + /* Ok, we have a new bucket, let's try again */ + ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt); + if (ret && (ret != -ENOSPC)) + mlog_errno(ret); + out: mlog_exit(ret); return ret; -- cgit v1.2.3 From d3981544d7a4ed276966cdd58fe2f5d6e8a585d9 Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Wed, 19 Aug 2009 02:13:50 -0700 Subject: ocfs2: Set xattr block entries with ocfs2_xa_set() ocfs2_xattr_block_set() calls into ocfs2_xattr_set_entry() with just the HAS_XATTR flag. Most of the machinery of ocfs2_xattr_set_entry() is skipped. All that really happens other than the call to ocfs2_xa_set() is making sure the HAS_XATTR flag is set on the inode. But HAS_XATTR should be set when we also set di->i_xattr_loc. And that's done in ocfs2_create_xattr_block(). So let's move it there, and then ocfs2_xattr_block_set() can just call ocfs2_xa_set(). While we're there, ocfs2_create_xattr_block() can take the set_ctxt for a smaller argument list. It also learns to set HAS_XATTR_FL, because it knows for sure. ocfs2_create_empty_xatttr_block() in the reflink path fakes a set_ctxt to call ocfs2_create_xattr_block(). Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/xattr.c | 99 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index e7630a77a6c8..fb8568d1e8a1 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2206,10 +2206,8 @@ static int ocfs2_xattr_set_entry(struct inode *inode, int ret; struct ocfs2_xa_loc loc; - if (!(flag & OCFS2_INLINE_XATTR_FL)) - BUG_ON(xs->xattr_bh == xs->inode_bh); - else - BUG_ON(xs->xattr_bh != xs->inode_bh); + BUG_ON(!(flag & OCFS2_INLINE_XATTR_FL)); + BUG_ON(xs->xattr_bh != xs->inode_bh); ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -2218,13 +2216,8 @@ static int ocfs2_xattr_set_entry(struct inode *inode, goto out; } - if (xs->xattr_bh == xs->inode_bh) - ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh, - xs->not_found ? NULL : xs->here); - else - ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh, - xs->not_found ? NULL : xs->here); - + ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh, + xs->not_found ? NULL : xs->here); ret = ocfs2_xa_set(&loc, xi, ctxt); if (ret) { if (ret != -ENOSPC) @@ -2233,8 +2226,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, } xs->here = loc.xl_entry; - if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) && - (flag & OCFS2_INLINE_XATTR_FL)) { + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); unsigned int xattrsize = osb->s_xattr_inline_size; @@ -2254,7 +2246,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, } /* Update xattr flag */ spin_lock(&oi->ip_lock); - oi->ip_dyn_features |= flag; + oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL; di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); spin_unlock(&oi->ip_lock); @@ -2748,12 +2740,11 @@ cleanup: return ret; } -static int ocfs2_create_xattr_block(handle_t *handle, - struct inode *inode, +static int ocfs2_create_xattr_block(struct inode *inode, struct buffer_head *inode_bh, - struct ocfs2_alloc_context *meta_ac, - struct buffer_head **ret_bh, - int indexed) + struct ocfs2_xattr_set_ctxt *ctxt, + int indexed, + struct buffer_head **ret_bh) { int ret; u16 suballoc_bit_start; @@ -2764,14 +2755,14 @@ static int ocfs2_create_xattr_block(handle_t *handle, struct buffer_head *new_bh = NULL; struct ocfs2_xattr_block *xblk; - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), + inode_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret < 0) { mlog_errno(ret); goto end; } - ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, + ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1, &suballoc_bit_start, &num_got, &first_blkno); if (ret < 0) { @@ -2782,7 +2773,7 @@ static int ocfs2_create_xattr_block(handle_t *handle, new_bh = sb_getblk(inode->i_sb, first_blkno); ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); - ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), + ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode), new_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret < 0) { @@ -2794,11 +2785,10 @@ static int ocfs2_create_xattr_block(handle_t *handle, xblk = (struct ocfs2_xattr_block *)new_bh->b_data; memset(xblk, 0, inode->i_sb->s_blocksize); strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); - xblk->xb_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot); xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); xblk->xb_blkno = cpu_to_le64(first_blkno); - if (indexed) { struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; xr->xt_clusters = cpu_to_le32(1); @@ -2809,14 +2799,17 @@ static int ocfs2_create_xattr_block(handle_t *handle, xr->xt_list.l_next_free_rec = cpu_to_le16(1); xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED); } + ocfs2_journal_dirty(ctxt->handle, new_bh); - ret = ocfs2_journal_dirty(handle, new_bh); - if (ret < 0) { - mlog_errno(ret); - goto end; - } + /* Add it to the inode */ di->i_xattr_loc = cpu_to_le64(first_blkno); - ocfs2_journal_dirty(handle, inode_bh); + + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_dyn_features |= OCFS2_HAS_XATTR_FL; + di->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + ocfs2_journal_dirty(ctxt->handle, inode_bh); *ret_bh = new_bh; new_bh = NULL; @@ -2838,13 +2831,13 @@ static int ocfs2_xattr_block_set(struct inode *inode, struct ocfs2_xattr_set_ctxt *ctxt) { struct buffer_head *new_bh = NULL; - handle_t *handle = ctxt->handle; struct ocfs2_xattr_block *xblk = NULL; int ret; + struct ocfs2_xa_loc loc; if (!xs->xattr_bh) { - ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh, - ctxt->meta_ac, &new_bh, 0); + ret = ocfs2_create_xattr_block(inode, xs->inode_bh, ctxt, + 0, &new_bh); if (ret) { mlog_errno(ret); goto end; @@ -2860,21 +2853,25 @@ static int ocfs2_xattr_block_set(struct inode *inode, xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { - /* Set extended attribute into external block */ - ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, - OCFS2_HAS_XATTR_FL); - if (!ret || ret != -ENOSPC) - goto end; + ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh, + xs->not_found ? NULL : xs->here); - ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); - if (ret) + ret = ocfs2_xa_set(&loc, xi, ctxt); + if (!ret) + xs->here = loc.xl_entry; + else if (ret != -ENOSPC) goto end; + else { + ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); + if (ret) + goto end; + } } - ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt); + if (le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED) + ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt); end: - return ret; } @@ -6434,9 +6431,11 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode, int indexed) { int ret; - handle_t *handle; struct ocfs2_alloc_context *meta_ac; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_xattr_set_ctxt ctxt = { + .meta_ac = meta_ac, + }; ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); if (ret < 0) { @@ -6444,21 +6443,21 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode, return ret; } - handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); + ctxt.handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); mlog_errno(ret); goto out; } mlog(0, "create new xattr block for inode %llu, index = %d\n", (unsigned long long)fe_bh->b_blocknr, indexed); - ret = ocfs2_create_xattr_block(handle, inode, fe_bh, - meta_ac, ret_bh, indexed); + ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed, + ret_bh); if (ret) mlog_errno(ret); - ocfs2_commit_trans(osb, handle); + ocfs2_commit_trans(osb, ctxt.handle); out: ocfs2_free_alloc_context(meta_ac); return ret; -- cgit v1.2.3 From 139ffacebf5fe2cd9e2ae40d325a9661a679ad4f Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Wed, 19 Aug 2009 11:09:17 -0700 Subject: ocfs2: Set inline xattr entries with ocfs2_xa_set() ocfs2_xattr_ibody_set() is the only remaining user of ocfs2_xattr_set_entry(). ocfs2_xattr_set_entry() actually does two things: it calls ocfs2_xa_set(), and it initializes the inline xattrs. Initializing the inline space really belongs in its own call. We lift the initialization to ocfs2_xattr_ibody_init(), called from ocfs2_xattr_ibody_set() only when necessary. Now ocfs2_xattr_ibody_set() can call ocfs2_xa_set() directly. ocfs2_xattr_set_entry() goes away. Another nice fact is that ocfs2_init_dinode_xa_loc() can trust i_xattr_inline_size. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/xattr.c | 157 ++++++++++++++++++++++++++----------------------------- 1 file changed, 73 insertions(+), 84 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index fb8568d1e8a1..a2d912a92dd7 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2137,17 +2137,13 @@ static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc, { struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL)); + loc->xl_inode = inode; loc->xl_ops = &ocfs2_xa_block_loc_ops; loc->xl_storage = bh; loc->xl_entry = entry; - - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL) - loc->xl_size = le16_to_cpu(di->i_xattr_inline_size); - else { - BUG_ON(entry); - loc->xl_size = OCFS2_SB(inode->i_sb)->s_xattr_inline_size; - } + loc->xl_size = le16_to_cpu(di->i_xattr_inline_size); loc->xl_header = (struct ocfs2_xattr_header *)(bh->b_data + bh->b_size - loc->xl_size); @@ -2184,80 +2180,6 @@ static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc, loc->xl_size = OCFS2_XATTR_BUCKET_SIZE; } - -/* - * ocfs2_xattr_set_entry() - * - * Set extended attribute entry into inode or block. - * - * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE, - * We first insert tree root(ocfs2_xattr_value_root) like a normal value, - * then set value in B tree with set_value_outside(). - */ -static int ocfs2_xattr_set_entry(struct inode *inode, - struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs, - struct ocfs2_xattr_set_ctxt *ctxt, - int flag) -{ - struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; - handle_t *handle = ctxt->handle; - int ret; - struct ocfs2_xa_loc loc; - - BUG_ON(!(flag & OCFS2_INLINE_XATTR_FL)); - BUG_ON(xs->xattr_bh != xs->inode_bh); - - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } - - ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh, - xs->not_found ? NULL : xs->here); - ret = ocfs2_xa_set(&loc, xi, ctxt); - if (ret) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto out; - } - xs->here = loc.xl_entry; - - if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - unsigned int xattrsize = osb->s_xattr_inline_size; - - /* - * Adjust extent record count or inline data size - * to reserve space for extended attribute. - */ - if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - struct ocfs2_inline_data *idata = &di->id2.i_data; - le16_add_cpu(&idata->id_count, -xattrsize); - } else if (!(ocfs2_inode_is_fast_symlink(inode))) { - struct ocfs2_extent_list *el = &di->id2.i_list; - le16_add_cpu(&el->l_count, -(xattrsize / - sizeof(struct ocfs2_extent_rec))); - } - di->i_xattr_inline_size = cpu_to_le16(xattrsize); - } - /* Update xattr flag */ - spin_lock(&oi->ip_lock); - oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL; - di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); - spin_unlock(&oi->ip_lock); - - ret = ocfs2_journal_dirty(handle, xs->inode_bh); - if (ret < 0) - mlog_errno(ret); - -out: - return ret; -} - /* * In xattr remove, if it is stored outside and refcounted, we may have * the chance to split the refcount tree. So need the allocators. @@ -2653,6 +2575,55 @@ static int ocfs2_xattr_ibody_find(struct inode *inode, return 0; } +static int ocfs2_xattr_ibody_init(struct inode *inode, + struct buffer_head *di_bh, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int xattrsize = osb->s_xattr_inline_size; + + if (!ocfs2_xattr_has_space_inline(inode, di)) { + ret = -ENOSPC; + goto out; + } + + ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Adjust extent record count or inline data size + * to reserve space for extended attribute. + */ + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + struct ocfs2_inline_data *idata = &di->id2.i_data; + le16_add_cpu(&idata->id_count, -xattrsize); + } else if (!(ocfs2_inode_is_fast_symlink(inode))) { + struct ocfs2_extent_list *el = &di->id2.i_list; + le16_add_cpu(&el->l_count, -(xattrsize / + sizeof(struct ocfs2_extent_rec))); + } + di->i_xattr_inline_size = cpu_to_le16(xattrsize); + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL|OCFS2_HAS_XATTR_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + ret = ocfs2_journal_dirty(ctxt->handle, di_bh); + if (ret < 0) + mlog_errno(ret); + +out: + return ret; +} + /* * ocfs2_xattr_ibody_set() * @@ -2664,9 +2635,10 @@ static int ocfs2_xattr_ibody_set(struct inode *inode, struct ocfs2_xattr_search *xs, struct ocfs2_xattr_set_ctxt *ctxt) { + int ret; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; - int ret; + struct ocfs2_xa_loc loc; if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) return -ENOSPC; @@ -2679,8 +2651,25 @@ static int ocfs2_xattr_ibody_set(struct inode *inode, } } - ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, - (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + + ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh, + xs->not_found ? NULL : xs->here); + ret = ocfs2_xa_set(&loc, xi, ctxt); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + xs->here = loc.xl_entry; + out: up_write(&oi->ip_alloc_sem); -- cgit v1.2.3 From 399ff3a748cf4c8c853e96dd477153202636527b Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Tue, 1 Sep 2009 18:38:27 -0700 Subject: ocfs2: Handle errors while setting external xattr values. ocfs2 can store extended attribute values as large as a single file. It does this using a standard ocfs2 btree for the large value. However, the previous code did not handle all error cases cleanly. There are multiple problems to have. 1) We have trouble allocating space for a new xattr. This leaves us with an empty xattr. 2) We overwrote an existing local xattr with a value root, and now we have an error allocating the storage. This leaves us an empty xattr. where there used to be a value. The value is lost. 3) We have trouble truncating a reused value. This leaves us with the original entry pointing to the truncated original value. The value is lost. 4) We have trouble extending the storage on a reused value. This leaves us with the original value safely in place, but with more storage allocated when needed. This doesn't consider storing local xattrs (values that don't require a btree). Those only fail when the journal fails. Case (1) is easy. We just remove the xattr we added. We leak the storage because we can't safely remove it, but otherwise everything is happy. We'll print a warning about the leak. Case (4) is easy. We still have the original value in place. We can just leave the extra storage attached to this xattr. We return the error, but the old value is untouched. We print a warning about the storage. Case (2) and (3) are hard because we've lost the original values. In the old code, we ended up with values that could be partially read. That's not good. Instead, we just wipe the xattr entry and leak the storage. It stinks that the original value is lost, but now there isn't a partial value to be read. We'll print a big fat warning. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/xattr.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 124 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index a2d912a92dd7..d1b0d386f6d1 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1869,6 +1869,17 @@ static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = { .xlo_fill_value_buf = ocfs2_xa_bucket_fill_value_buf, }; +static unsigned int ocfs2_xa_value_clusters(struct ocfs2_xa_loc *loc) +{ + struct ocfs2_xattr_value_buf vb; + + if (ocfs2_xattr_is_local(loc->xl_entry)) + return 0; + + ocfs2_xa_fill_value_buf(loc, &vb); + return le32_to_cpu(vb.vb_xv->xr_clusters); +} + static int ocfs2_xa_value_truncate(struct ocfs2_xa_loc *loc, u64 bytes, struct ocfs2_xattr_set_ctxt *ctxt) { @@ -1923,16 +1934,85 @@ static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc) } } +/* + * If we have a problem adjusting the size of an external value during + * ocfs2_xa_prepare_entry() or ocfs2_xa_remove(), we may have an xattr + * in an intermediate state. For example, the value may be partially + * truncated. + * + * If the value tree hasn't changed, the extend/truncate went nowhere. + * We have nothing to do. The caller can treat it as a straight error. + * + * If the value tree got partially truncated, we now have a corrupted + * extended attribute. We're going to wipe its entry and leak the + * clusters. Better to leak some storage than leave a corrupt entry. + * + * If the value tree grew, it obviously didn't grow enough for the + * new entry. We're not going to try and reclaim those clusters either. + * If there was already an external value there (orig_clusters != 0), + * the new clusters are attached safely and we can just leave the old + * value in place. If there was no external value there, we remove + * the entry. + * + * This way, the xattr block we store in the journal will be consistent. + * If the size change broke because of the journal, no changes will hit + * disk anyway. + */ +static void ocfs2_xa_cleanup_value_truncate(struct ocfs2_xa_loc *loc, + const char *what, + unsigned int orig_clusters) +{ + unsigned int new_clusters = ocfs2_xa_value_clusters(loc); + char *nameval_buf = ocfs2_xa_offset_pointer(loc, + le16_to_cpu(loc->xl_entry->xe_name_offset)); + + if (new_clusters < orig_clusters) { + mlog(ML_ERROR, + "Partial truncate while %s xattr %.*s. Leaking " + "%u clusters and removing the entry\n", + what, loc->xl_entry->xe_name_len, nameval_buf, + orig_clusters - new_clusters); + ocfs2_xa_remove_entry(loc); + } else if (!orig_clusters) { + mlog(ML_ERROR, + "Unable to allocate an external value for xattr " + "%.*s safely. Leaking %u clusters and removing the " + "entry\n", + loc->xl_entry->xe_name_len, nameval_buf, + new_clusters - orig_clusters); + ocfs2_xa_remove_entry(loc); + } else if (new_clusters > orig_clusters) + mlog(ML_ERROR, + "Unable to grow xattr %.*s safely. %u new clusters " + "have been added, but the value will not be " + "modified\n", + loc->xl_entry->xe_name_len, nameval_buf, + new_clusters - orig_clusters); +} + static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc, struct ocfs2_xattr_set_ctxt *ctxt) { int rc = 0; + unsigned int orig_clusters; if (!ocfs2_xattr_is_local(loc->xl_entry)) { + orig_clusters = ocfs2_xa_value_clusters(loc); rc = ocfs2_xa_value_truncate(loc, 0, ctxt); if (rc) { mlog_errno(rc); - goto out; + /* + * Since this is remove, we can return 0 if + * ocfs2_xa_cleanup_value_truncate() is going to + * wipe the entry anyway. So we check the + * cluster count as well. + */ + if (orig_clusters != ocfs2_xa_value_clusters(loc)) + rc = 0; + ocfs2_xa_cleanup_value_truncate(loc, "removing", + orig_clusters); + if (rc) + goto out; } } @@ -1963,6 +2043,7 @@ static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc, { int rc = 0; int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len); + unsigned int orig_clusters; char *nameval_buf; int xe_local = ocfs2_xattr_is_local(loc->xl_entry); int xi_local = xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE; @@ -1978,23 +2059,27 @@ static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc, if (!xi_local) ocfs2_xa_install_value_root(loc); } else { + orig_clusters = ocfs2_xa_value_clusters(loc); if (xi_local) { rc = ocfs2_xa_value_truncate(loc, 0, ctxt); - if (rc < 0) { + if (rc < 0) mlog_errno(rc); - goto out; - } - memset(nameval_buf + name_size, 0, - namevalue_size_xe(loc->xl_entry) - - name_size); + else + memset(nameval_buf + name_size, 0, + namevalue_size_xe(loc->xl_entry) - + name_size); } else if (le64_to_cpu(loc->xl_entry->xe_value_size) > xi->xi_value_len) { rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt); - if (rc < 0) { + if (rc < 0) mlog_errno(rc); - goto out; - } + } + + if (rc) { + ocfs2_xa_cleanup_value_truncate(loc, "reusing", + orig_clusters); + goto out; } } @@ -2019,6 +2104,8 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc, struct ocfs2_xattr_set_ctxt *ctxt) { int rc = 0; + unsigned int orig_clusters; + __le64 orig_value_size = 0; rc = ocfs2_xa_check_space(loc, xi); if (rc) @@ -2026,6 +2113,7 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc, if (loc->xl_entry) { if (ocfs2_xa_can_reuse_entry(loc, xi)) { + orig_value_size = loc->xl_entry->xe_value_size; rc = ocfs2_xa_reuse_entry(loc, xi, ctxt); if (rc) goto out; @@ -2033,9 +2121,13 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc, } if (!ocfs2_xattr_is_local(loc->xl_entry)) { + orig_clusters = ocfs2_xa_value_clusters(loc); rc = ocfs2_xa_value_truncate(loc, 0, ctxt); if (rc) { mlog_errno(rc); + ocfs2_xa_cleanup_value_truncate(loc, + "overwriting", + orig_clusters); goto out; } } @@ -2053,9 +2145,20 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc, alloc_value: if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + orig_clusters = ocfs2_xa_value_clusters(loc); rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt); - if (rc < 0) + if (rc < 0) { + /* + * If we tried to grow an existing external value, + * ocfs2_xa_cleanuP-value_truncate() is going to + * let it stand. We have to restore its original + * value size. + */ + loc->xl_entry->xe_value_size = orig_value_size; + ocfs2_xa_cleanup_value_truncate(loc, "growing", + orig_clusters); mlog_errno(rc); + } } out: @@ -2105,25 +2208,30 @@ static int ocfs2_xa_set(struct ocfs2_xa_loc *loc, goto out; } + /* + * From here on out, everything is going to modify the buffer a + * little. Errors are going to leave the xattr header in a + * sane state. Thus, even with errors we dirty the sucker. + */ + /* Don't worry, we are never called with !xi_value and !xl_entry */ if (!xi->xi_value) { ret = ocfs2_xa_remove(loc, ctxt); - goto out; + goto out_dirty; } ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt); if (ret) { if (ret != -ENOSPC) mlog_errno(ret); - goto out; + goto out_dirty; } ret = ocfs2_xa_store_value(loc, xi, ctxt); - if (ret) { + if (ret) mlog_errno(ret); - goto out; - } +out_dirty: ocfs2_xa_journal_dirty(ctxt->handle, loc); out: -- cgit v1.2.3 From 14a437c2b67aeee2a989a5d9c7e19094355d2fed Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Thu, 4 Feb 2010 15:37:27 -0800 Subject: ocfs2_dlmfs: Add capabilities parameter. Over time, dlmfs has added some features that were not part of the initial ABI. Unfortunately, some of these features are not detectable via standard usage. For example, Linux's default poll always returns POLLIN, so there is no way for a caller of poll(2) to know when dlmfs added poll support. Instead, we provide this list of new capabilities. Capabilities is a read-only attribute. We do it as a module parameter so we can discover it whether dlmfs is built in, loaded, or even not loaded (via modinfo). The ABI features are local to this machine's dlmfs mount. This is distinct from the locking protocol, which is concerned with inter-node interaction. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/dlm/dlmfs.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 02bf17808bdc..77d0df42fe02 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -81,6 +81,42 @@ static const struct dlm_protocol_version user_locking_protocol = { .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, }; + +/* + * These are the ABI capabilities of dlmfs. + * + * Over time, dlmfs has added some features that were not part of the + * initial ABI. Unfortunately, some of these features are not detectable + * via standard usage. For example, Linux's default poll always returns + * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs + * added poll support. Instead, we provide this list of new capabilities. + * + * Capabilities is a read-only attribute. We do it as a module parameter + * so we can discover it whether dlmfs is built in, loaded, or even not + * loaded. + * + * The ABI features are local to this machine's dlmfs mount. This is + * distinct from the locking protocol, which is concerned with inter-node + * interaction. + */ +#define DLMFS_CAPABILITIES "" +extern int param_set_dlmfs_capabilities(const char *val, + struct kernel_param *kp) +{ + printk(KERN_ERR "%s: readonly parameter\n", kp->name); + return -EINVAL; +} +static int param_get_dlmfs_capabilities(char *buffer, + struct kernel_param *kp) +{ + return strlcpy(buffer, DLMFS_CAPABILITIES, + strlen(DLMFS_CAPABILITIES) + 1); +} +module_param_call(capabilities, param_set_dlmfs_capabilities, + param_get_dlmfs_capabilities, NULL, 0444); +MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES); + + /* * decodes a set of open flags into a valid lock level and a set of flags. * returns < 0 if we have invalid flags -- cgit v1.2.3 From 65b6f3403431cd43ef7b0dab679a50f770124a65 Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Tue, 26 Jan 2010 22:14:20 -0800 Subject: ocfs2_dlmfs: Use poll() to signify BASTs. o2dlm's userspace filesystem is an easy way to use the DLM from userspace. It is intentionally simple. For example, it does not allow for asynchronous behavior or lock conversion. This is intentional to keep the interface simple. Because there is no asynchronous notification, there is no way for a process holding a lock to know another node needs the lock. This is the number one complaint of ocfs2_dlmfs users. Turns out, we can solve this very easily. We add poll() support to ocfs2_dlmfs. When a BAST is received, the lock's file descriptor will receive POLLIN. This is trivial to implement. Userdlm already has an appropriate waitqueue, and the lock knows when it is blocked. We add the "bast" capability to tell userspace this is available. Signed-off-by: Joel Becker <joel.becker@oracle.com> Acked-by: Mark Fasheh <mfasheh@suse.com> Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/dlm/dlmfs.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 77d0df42fe02..ddf55dafba2d 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -43,6 +43,7 @@ #include <linux/init.h> #include <linux/string.h> #include <linux/backing-dev.h> +#include <linux/poll.h> #include <asm/uaccess.h> @@ -98,8 +99,12 @@ static const struct dlm_protocol_version user_locking_protocol = { * The ABI features are local to this machine's dlmfs mount. This is * distinct from the locking protocol, which is concerned with inter-node * interaction. + * + * Capabilities: + * - bast : POLLIN against the file descriptor of a held lock + * signifies a bast fired on the lock. */ -#define DLMFS_CAPABILITIES "" +#define DLMFS_CAPABILITIES "bast" extern int param_set_dlmfs_capabilities(const char *val, struct kernel_param *kp) { @@ -215,6 +220,22 @@ static int dlmfs_file_release(struct inode *inode, return 0; } +static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait) +{ + int event = 0; + struct inode *inode = file->f_path.dentry->d_inode; + struct dlmfs_inode_private *ip = DLMFS_I(inode); + + poll_wait(file, &ip->ip_lockres.l_event, wait); + + spin_lock(&ip->ip_lockres.l_lock); + if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED) + event = POLLIN | POLLRDNORM; + spin_unlock(&ip->ip_lockres.l_lock); + + return event; +} + static ssize_t dlmfs_file_read(struct file *filp, char __user *buf, size_t count, @@ -585,6 +606,7 @@ static int dlmfs_fill_super(struct super_block * sb, static const struct file_operations dlmfs_file_operations = { .open = dlmfs_file_open, .release = dlmfs_file_release, + .poll = dlmfs_file_poll, .read = dlmfs_file_read, .write = dlmfs_file_write, }; -- cgit v1.2.3 From 34a9dd7e29e9129fec40c645a03f1bbbe810e771 Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Thu, 28 Jan 2010 15:00:49 -0800 Subject: ocfs2_dlmfs: Move to its own directory We're going to remove the tie between ocfs2_dlmfs and o2dlm. ocfs2_dlmfs doesn't belong in the fs/ocfs2/dlm directory anymore. Here we move it to fs/ocfs2/dlmfs. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/Makefile | 1 + fs/ocfs2/dlm/Makefile | 3 +- fs/ocfs2/dlm/dlmfs.c | 710 ---------------------------------------------- fs/ocfs2/dlm/dlmfsver.c | 42 --- fs/ocfs2/dlm/dlmfsver.h | 31 -- fs/ocfs2/dlm/userdlm.c | 676 ------------------------------------------- fs/ocfs2/dlm/userdlm.h | 113 -------- fs/ocfs2/dlmfs/Makefile | 5 + fs/ocfs2/dlmfs/dlmfs.c | 710 ++++++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/dlmfs/dlmfsver.c | 42 +++ fs/ocfs2/dlmfs/dlmfsver.h | 31 ++ fs/ocfs2/dlmfs/userdlm.c | 676 +++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/dlmfs/userdlm.h | 113 ++++++++ 13 files changed, 1579 insertions(+), 1574 deletions(-) delete mode 100644 fs/ocfs2/dlm/dlmfs.c delete mode 100644 fs/ocfs2/dlm/dlmfsver.c delete mode 100644 fs/ocfs2/dlm/dlmfsver.h delete mode 100644 fs/ocfs2/dlm/userdlm.c delete mode 100644 fs/ocfs2/dlm/userdlm.h create mode 100644 fs/ocfs2/dlmfs/Makefile create mode 100644 fs/ocfs2/dlmfs/dlmfs.c create mode 100644 fs/ocfs2/dlmfs/dlmfsver.c create mode 100644 fs/ocfs2/dlmfs/dlmfsver.h create mode 100644 fs/ocfs2/dlmfs/userdlm.c create mode 100644 fs/ocfs2/dlmfs/userdlm.h (limited to 'fs') diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 600d2d2ade11..791c0886c060 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -46,6 +46,7 @@ ocfs2_stackglue-objs := stackglue.o ocfs2_stack_o2cb-objs := stack_o2cb.o ocfs2_stack_user-objs := stack_user.o +obj-$(CONFIG_OCFS2_FS) += dlmfs/ # cluster/ is always needed when OCFS2_FS for masklog support obj-$(CONFIG_OCFS2_FS) += cluster/ obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/ diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index 190361375700..dcebf0d920fa 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile @@ -1,8 +1,7 @@ EXTRA_CFLAGS += -Ifs/ocfs2 -obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o +obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o -ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c deleted file mode 100644 index ddf55dafba2d..000000000000 --- a/fs/ocfs2/dlm/dlmfs.c +++ /dev/null @@ -1,710 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmfs.c - * - * Code which implements the kernel side of a minimal userspace - * interface to our DLM. This file handles the virtual file system - * used for communication with userspace. Credit should go to ramfs, - * which was a template for the fs side of this module. - * - * Copyright (C) 2003, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -/* Simple VFS hooks based on: */ -/* - * Resizable simple ram filesystem for Linux. - * - * Copyright (C) 2000 Linus Torvalds. - * 2000 Transmeta Corp. - */ - -#include <linux/module.h> -#include <linux/fs.h> -#include <linux/pagemap.h> -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/highmem.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/backing-dev.h> -#include <linux/poll.h> - -#include <asm/uaccess.h> - - -#include "cluster/nodemanager.h" -#include "cluster/heartbeat.h" -#include "cluster/tcp.h" - -#include "dlmapi.h" - -#include "userdlm.h" - -#include "dlmfsver.h" - -#define MLOG_MASK_PREFIX ML_DLMFS -#include "cluster/masklog.h" - -#include "ocfs2_lockingver.h" - -static const struct super_operations dlmfs_ops; -static const struct file_operations dlmfs_file_operations; -static const struct inode_operations dlmfs_dir_inode_operations; -static const struct inode_operations dlmfs_root_inode_operations; -static const struct inode_operations dlmfs_file_inode_operations; -static struct kmem_cache *dlmfs_inode_cache; - -struct workqueue_struct *user_dlm_worker; - -/* - * This is the userdlmfs locking protocol version. - * - * See fs/ocfs2/dlmglue.c for more details on locking versions. - */ -static const struct dlm_protocol_version user_locking_protocol = { - .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, - .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, -}; - - -/* - * These are the ABI capabilities of dlmfs. - * - * Over time, dlmfs has added some features that were not part of the - * initial ABI. Unfortunately, some of these features are not detectable - * via standard usage. For example, Linux's default poll always returns - * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs - * added poll support. Instead, we provide this list of new capabilities. - * - * Capabilities is a read-only attribute. We do it as a module parameter - * so we can discover it whether dlmfs is built in, loaded, or even not - * loaded. - * - * The ABI features are local to this machine's dlmfs mount. This is - * distinct from the locking protocol, which is concerned with inter-node - * interaction. - * - * Capabilities: - * - bast : POLLIN against the file descriptor of a held lock - * signifies a bast fired on the lock. - */ -#define DLMFS_CAPABILITIES "bast" -extern int param_set_dlmfs_capabilities(const char *val, - struct kernel_param *kp) -{ - printk(KERN_ERR "%s: readonly parameter\n", kp->name); - return -EINVAL; -} -static int param_get_dlmfs_capabilities(char *buffer, - struct kernel_param *kp) -{ - return strlcpy(buffer, DLMFS_CAPABILITIES, - strlen(DLMFS_CAPABILITIES) + 1); -} -module_param_call(capabilities, param_set_dlmfs_capabilities, - param_get_dlmfs_capabilities, NULL, 0444); -MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES); - - -/* - * decodes a set of open flags into a valid lock level and a set of flags. - * returns < 0 if we have invalid flags - * flags which mean something to us: - * O_RDONLY -> PRMODE level - * O_WRONLY -> EXMODE level - * - * O_NONBLOCK -> LKM_NOQUEUE - */ -static int dlmfs_decode_open_flags(int open_flags, - int *level, - int *flags) -{ - if (open_flags & (O_WRONLY|O_RDWR)) - *level = LKM_EXMODE; - else - *level = LKM_PRMODE; - - *flags = 0; - if (open_flags & O_NONBLOCK) - *flags |= LKM_NOQUEUE; - - return 0; -} - -static int dlmfs_file_open(struct inode *inode, - struct file *file) -{ - int status, level, flags; - struct dlmfs_filp_private *fp = NULL; - struct dlmfs_inode_private *ip; - - if (S_ISDIR(inode->i_mode)) - BUG(); - - mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino, - file->f_flags); - - status = dlmfs_decode_open_flags(file->f_flags, &level, &flags); - if (status < 0) - goto bail; - - /* We don't want to honor O_APPEND at read/write time as it - * doesn't make sense for LVB writes. */ - file->f_flags &= ~O_APPEND; - - fp = kmalloc(sizeof(*fp), GFP_NOFS); - if (!fp) { - status = -ENOMEM; - goto bail; - } - fp->fp_lock_level = level; - - ip = DLMFS_I(inode); - - status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags); - if (status < 0) { - /* this is a strange error to return here but I want - * to be able userspace to be able to distinguish a - * valid lock request from one that simply couldn't be - * granted. */ - if (flags & LKM_NOQUEUE && status == -EAGAIN) - status = -ETXTBSY; - kfree(fp); - goto bail; - } - - file->private_data = fp; -bail: - return status; -} - -static int dlmfs_file_release(struct inode *inode, - struct file *file) -{ - int level, status; - struct dlmfs_inode_private *ip = DLMFS_I(inode); - struct dlmfs_filp_private *fp = - (struct dlmfs_filp_private *) file->private_data; - - if (S_ISDIR(inode->i_mode)) - BUG(); - - mlog(0, "close called on inode %lu\n", inode->i_ino); - - status = 0; - if (fp) { - level = fp->fp_lock_level; - if (level != LKM_IVMODE) - user_dlm_cluster_unlock(&ip->ip_lockres, level); - - kfree(fp); - file->private_data = NULL; - } - - return 0; -} - -static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait) -{ - int event = 0; - struct inode *inode = file->f_path.dentry->d_inode; - struct dlmfs_inode_private *ip = DLMFS_I(inode); - - poll_wait(file, &ip->ip_lockres.l_event, wait); - - spin_lock(&ip->ip_lockres.l_lock); - if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED) - event = POLLIN | POLLRDNORM; - spin_unlock(&ip->ip_lockres.l_lock); - - return event; -} - -static ssize_t dlmfs_file_read(struct file *filp, - char __user *buf, - size_t count, - loff_t *ppos) -{ - int bytes_left; - ssize_t readlen; - char *lvb_buf; - struct inode *inode = filp->f_path.dentry->d_inode; - - mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", - inode->i_ino, count, *ppos); - - if (*ppos >= i_size_read(inode)) - return 0; - - if (!count) - return 0; - - if (!access_ok(VERIFY_WRITE, buf, count)) - return -EFAULT; - - /* don't read past the lvb */ - if ((count + *ppos) > i_size_read(inode)) - readlen = i_size_read(inode) - *ppos; - else - readlen = count - *ppos; - - lvb_buf = kmalloc(readlen, GFP_NOFS); - if (!lvb_buf) - return -ENOMEM; - - user_dlm_read_lvb(inode, lvb_buf, readlen); - bytes_left = __copy_to_user(buf, lvb_buf, readlen); - readlen -= bytes_left; - - kfree(lvb_buf); - - *ppos = *ppos + readlen; - - mlog(0, "read %zd bytes\n", readlen); - return readlen; -} - -static ssize_t dlmfs_file_write(struct file *filp, - const char __user *buf, - size_t count, - loff_t *ppos) -{ - int bytes_left; - ssize_t writelen; - char *lvb_buf; - struct inode *inode = filp->f_path.dentry->d_inode; - - mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", - inode->i_ino, count, *ppos); - - if (*ppos >= i_size_read(inode)) - return -ENOSPC; - - if (!count) - return 0; - - if (!access_ok(VERIFY_READ, buf, count)) - return -EFAULT; - - /* don't write past the lvb */ - if ((count + *ppos) > i_size_read(inode)) - writelen = i_size_read(inode) - *ppos; - else - writelen = count - *ppos; - - lvb_buf = kmalloc(writelen, GFP_NOFS); - if (!lvb_buf) - return -ENOMEM; - - bytes_left = copy_from_user(lvb_buf, buf, writelen); - writelen -= bytes_left; - if (writelen) - user_dlm_write_lvb(inode, lvb_buf, writelen); - - kfree(lvb_buf); - - *ppos = *ppos + writelen; - mlog(0, "wrote %zd bytes\n", writelen); - return writelen; -} - -static void dlmfs_init_once(void *foo) -{ - struct dlmfs_inode_private *ip = - (struct dlmfs_inode_private *) foo; - - ip->ip_dlm = NULL; - ip->ip_parent = NULL; - - inode_init_once(&ip->ip_vfs_inode); -} - -static struct inode *dlmfs_alloc_inode(struct super_block *sb) -{ - struct dlmfs_inode_private *ip; - - ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS); - if (!ip) - return NULL; - - return &ip->ip_vfs_inode; -} - -static void dlmfs_destroy_inode(struct inode *inode) -{ - kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); -} - -static void dlmfs_clear_inode(struct inode *inode) -{ - int status; - struct dlmfs_inode_private *ip; - - if (!inode) - return; - - mlog(0, "inode %lu\n", inode->i_ino); - - ip = DLMFS_I(inode); - - if (S_ISREG(inode->i_mode)) { - status = user_dlm_destroy_lock(&ip->ip_lockres); - if (status < 0) - mlog_errno(status); - iput(ip->ip_parent); - goto clear_fields; - } - - mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm); - /* we must be a directory. If required, lets unregister the - * dlm context now. */ - if (ip->ip_dlm) - user_dlm_unregister_context(ip->ip_dlm); -clear_fields: - ip->ip_parent = NULL; - ip->ip_dlm = NULL; -} - -static struct backing_dev_info dlmfs_backing_dev_info = { - .name = "ocfs2-dlmfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - -static struct inode *dlmfs_get_root_inode(struct super_block *sb) -{ - struct inode *inode = new_inode(sb); - int mode = S_IFDIR | 0755; - struct dlmfs_inode_private *ip; - - if (inode) { - ip = DLMFS_I(inode); - - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inc_nlink(inode); - - inode->i_fop = &simple_dir_operations; - inode->i_op = &dlmfs_root_inode_operations; - } - - return inode; -} - -static struct inode *dlmfs_get_inode(struct inode *parent, - struct dentry *dentry, - int mode) -{ - struct super_block *sb = parent->i_sb; - struct inode * inode = new_inode(sb); - struct dlmfs_inode_private *ip; - - if (!inode) - return NULL; - - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - - ip = DLMFS_I(inode); - ip->ip_dlm = DLMFS_I(parent)->ip_dlm; - - switch (mode & S_IFMT) { - default: - /* for now we don't support anything other than - * directories and regular files. */ - BUG(); - break; - case S_IFREG: - inode->i_op = &dlmfs_file_inode_operations; - inode->i_fop = &dlmfs_file_operations; - - i_size_write(inode, DLM_LVB_LEN); - - user_dlm_lock_res_init(&ip->ip_lockres, dentry); - - /* released at clear_inode time, this insures that we - * get to drop the dlm reference on each lock *before* - * we call the unregister code for releasing parent - * directories. */ - ip->ip_parent = igrab(parent); - BUG_ON(!ip->ip_parent); - break; - case S_IFDIR: - inode->i_op = &dlmfs_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - - /* directory inodes start off with i_nlink == - * 2 (for "." entry) */ - inc_nlink(inode); - break; - } - - if (parent->i_mode & S_ISGID) { - inode->i_gid = parent->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } - - return inode; -} - -/* - * File creation. Allocate an inode, and we're done.. - */ -/* SMP-safe */ -static int dlmfs_mkdir(struct inode * dir, - struct dentry * dentry, - int mode) -{ - int status; - struct inode *inode = NULL; - struct qstr *domain = &dentry->d_name; - struct dlmfs_inode_private *ip; - struct dlm_ctxt *dlm; - struct dlm_protocol_version proto = user_locking_protocol; - - mlog(0, "mkdir %.*s\n", domain->len, domain->name); - - /* verify that we have a proper domain */ - if (domain->len >= O2NM_MAX_NAME_LEN) { - status = -EINVAL; - mlog(ML_ERROR, "invalid domain name for directory.\n"); - goto bail; - } - - inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR); - if (!inode) { - status = -ENOMEM; - mlog_errno(status); - goto bail; - } - - ip = DLMFS_I(inode); - - dlm = user_dlm_register_context(domain, &proto); - if (IS_ERR(dlm)) { - status = PTR_ERR(dlm); - mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", - status, domain->len, domain->name); - goto bail; - } - ip->ip_dlm = dlm; - - inc_nlink(dir); - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ - - status = 0; -bail: - if (status < 0) - iput(inode); - return status; -} - -static int dlmfs_create(struct inode *dir, - struct dentry *dentry, - int mode, - struct nameidata *nd) -{ - int status = 0; - struct inode *inode; - struct qstr *name = &dentry->d_name; - - mlog(0, "create %.*s\n", name->len, name->name); - - /* verify name is valid and doesn't contain any dlm reserved - * characters */ - if (name->len >= USER_DLM_LOCK_ID_MAX_LEN || - name->name[0] == '$') { - status = -EINVAL; - mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len, - name->name); - goto bail; - } - - inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG); - if (!inode) { - status = -ENOMEM; - mlog_errno(status); - goto bail; - } - - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ -bail: - return status; -} - -static int dlmfs_unlink(struct inode *dir, - struct dentry *dentry) -{ - int status; - struct inode *inode = dentry->d_inode; - - mlog(0, "unlink inode %lu\n", inode->i_ino); - - /* if there are no current holders, or none that are waiting - * to acquire a lock, this basically destroys our lockres. */ - status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres); - if (status < 0) { - mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n", - dentry->d_name.len, dentry->d_name.name, status); - goto bail; - } - status = simple_unlink(dir, dentry); -bail: - return status; -} - -static int dlmfs_fill_super(struct super_block * sb, - void * data, - int silent) -{ - struct inode * inode; - struct dentry * root; - - sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = DLMFS_MAGIC; - sb->s_op = &dlmfs_ops; - inode = dlmfs_get_root_inode(sb); - if (!inode) - return -ENOMEM; - - root = d_alloc_root(inode); - if (!root) { - iput(inode); - return -ENOMEM; - } - sb->s_root = root; - return 0; -} - -static const struct file_operations dlmfs_file_operations = { - .open = dlmfs_file_open, - .release = dlmfs_file_release, - .poll = dlmfs_file_poll, - .read = dlmfs_file_read, - .write = dlmfs_file_write, -}; - -static const struct inode_operations dlmfs_dir_inode_operations = { - .create = dlmfs_create, - .lookup = simple_lookup, - .unlink = dlmfs_unlink, -}; - -/* this way we can restrict mkdir to only the toplevel of the fs. */ -static const struct inode_operations dlmfs_root_inode_operations = { - .lookup = simple_lookup, - .mkdir = dlmfs_mkdir, - .rmdir = simple_rmdir, -}; - -static const struct super_operations dlmfs_ops = { - .statfs = simple_statfs, - .alloc_inode = dlmfs_alloc_inode, - .destroy_inode = dlmfs_destroy_inode, - .clear_inode = dlmfs_clear_inode, - .drop_inode = generic_delete_inode, -}; - -static const struct inode_operations dlmfs_file_inode_operations = { - .getattr = simple_getattr, -}; - -static int dlmfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) -{ - return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt); -} - -static struct file_system_type dlmfs_fs_type = { - .owner = THIS_MODULE, - .name = "ocfs2_dlmfs", - .get_sb = dlmfs_get_sb, - .kill_sb = kill_litter_super, -}; - -static int __init init_dlmfs_fs(void) -{ - int status; - int cleanup_inode = 0, cleanup_worker = 0; - - dlmfs_print_version(); - - status = bdi_init(&dlmfs_backing_dev_info); - if (status) - return status; - - dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", - sizeof(struct dlmfs_inode_private), - 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - dlmfs_init_once); - if (!dlmfs_inode_cache) { - status = -ENOMEM; - goto bail; - } - cleanup_inode = 1; - - user_dlm_worker = create_singlethread_workqueue("user_dlm"); - if (!user_dlm_worker) { - status = -ENOMEM; - goto bail; - } - cleanup_worker = 1; - - status = register_filesystem(&dlmfs_fs_type); -bail: - if (status) { - if (cleanup_inode) - kmem_cache_destroy(dlmfs_inode_cache); - if (cleanup_worker) - destroy_workqueue(user_dlm_worker); - bdi_destroy(&dlmfs_backing_dev_info); - } else - printk("OCFS2 User DLM kernel interface loaded\n"); - return status; -} - -static void __exit exit_dlmfs_fs(void) -{ - unregister_filesystem(&dlmfs_fs_type); - - flush_workqueue(user_dlm_worker); - destroy_workqueue(user_dlm_worker); - - kmem_cache_destroy(dlmfs_inode_cache); - - bdi_destroy(&dlmfs_backing_dev_info); -} - -MODULE_AUTHOR("Oracle"); -MODULE_LICENSE("GPL"); - -module_init(init_dlmfs_fs) -module_exit(exit_dlmfs_fs) diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c deleted file mode 100644 index a733b3321f83..000000000000 --- a/fs/ocfs2/dlm/dlmfsver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmfsver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "dlmfsver.h" - -#define DLM_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION - -void dlmfs_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlm/dlmfsver.h deleted file mode 100644 index f35eadbed25c..000000000000 --- a/fs/ocfs2/dlm/dlmfsver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef DLMFS_VER_H -#define DLMFS_VER_H - -void dlmfs_print_version(void); - -#endif /* DLMFS_VER_H */ diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c deleted file mode 100644 index 4cb1d3dae250..000000000000 --- a/fs/ocfs2/dlm/userdlm.c +++ /dev/null @@ -1,676 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * userdlm.c - * - * Code which implements the kernel side of a minimal userspace - * interface to our DLM. - * - * Many of the functions here are pared down versions of dlmglue.c - * functions. - * - * Copyright (C) 2003, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/signal.h> - -#include <linux/module.h> -#include <linux/fs.h> -#include <linux/types.h> -#include <linux/crc32.h> - - -#include "cluster/nodemanager.h" -#include "cluster/heartbeat.h" -#include "cluster/tcp.h" - -#include "dlmapi.h" - -#include "userdlm.h" - -#define MLOG_MASK_PREFIX ML_DLMFS -#include "cluster/masklog.h" - -static inline int user_check_wait_flag(struct user_lock_res *lockres, - int flag) -{ - int ret; - - spin_lock(&lockres->l_lock); - ret = lockres->l_flags & flag; - spin_unlock(&lockres->l_lock); - - return ret; -} - -static inline void user_wait_on_busy_lock(struct user_lock_res *lockres) - -{ - wait_event(lockres->l_event, - !user_check_wait_flag(lockres, USER_LOCK_BUSY)); -} - -static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres) - -{ - wait_event(lockres->l_event, - !user_check_wait_flag(lockres, USER_LOCK_BLOCKED)); -} - -/* I heart container_of... */ -static inline struct dlm_ctxt * -dlm_ctxt_from_user_lockres(struct user_lock_res *lockres) -{ - struct dlmfs_inode_private *ip; - - ip = container_of(lockres, - struct dlmfs_inode_private, - ip_lockres); - return ip->ip_dlm; -} - -static struct inode * -user_dlm_inode_from_user_lockres(struct user_lock_res *lockres) -{ - struct dlmfs_inode_private *ip; - - ip = container_of(lockres, - struct dlmfs_inode_private, - ip_lockres); - return &ip->ip_vfs_inode; -} - -static inline void user_recover_from_dlm_error(struct user_lock_res *lockres) -{ - spin_lock(&lockres->l_lock); - lockres->l_flags &= ~USER_LOCK_BUSY; - spin_unlock(&lockres->l_lock); -} - -#define user_log_dlm_error(_func, _stat, _lockres) do { \ - mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ - "resource %.*s: %s\n", dlm_errname(_stat), _func, \ - _lockres->l_namelen, _lockres->l_name, dlm_errmsg(_stat)); \ -} while (0) - -/* WARNING: This function lives in a world where the only three lock - * levels are EX, PR, and NL. It *will* have to be adjusted when more - * lock types are added. */ -static inline int user_highest_compat_lock_level(int level) -{ - int new_level = LKM_EXMODE; - - if (level == LKM_EXMODE) - new_level = LKM_NLMODE; - else if (level == LKM_PRMODE) - new_level = LKM_PRMODE; - return new_level; -} - -static void user_ast(void *opaque) -{ - struct user_lock_res *lockres = opaque; - struct dlm_lockstatus *lksb; - - mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen, - lockres->l_name); - - spin_lock(&lockres->l_lock); - - lksb = &(lockres->l_lksb); - if (lksb->status != DLM_NORMAL) { - mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", - lksb->status, lockres->l_namelen, lockres->l_name); - spin_unlock(&lockres->l_lock); - return; - } - - mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, - "Lockres %.*s, requested ivmode. flags 0x%x\n", - lockres->l_namelen, lockres->l_name, lockres->l_flags); - - /* we're downconverting. */ - if (lockres->l_requested < lockres->l_level) { - if (lockres->l_requested <= - user_highest_compat_lock_level(lockres->l_blocking)) { - lockres->l_blocking = LKM_NLMODE; - lockres->l_flags &= ~USER_LOCK_BLOCKED; - } - } - - lockres->l_level = lockres->l_requested; - lockres->l_requested = LKM_IVMODE; - lockres->l_flags |= USER_LOCK_ATTACHED; - lockres->l_flags &= ~USER_LOCK_BUSY; - - spin_unlock(&lockres->l_lock); - - wake_up(&lockres->l_event); -} - -static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres) -{ - struct inode *inode; - inode = user_dlm_inode_from_user_lockres(lockres); - if (!igrab(inode)) - BUG(); -} - -static void user_dlm_unblock_lock(struct work_struct *work); - -static void __user_dlm_queue_lockres(struct user_lock_res *lockres) -{ - if (!(lockres->l_flags & USER_LOCK_QUEUED)) { - user_dlm_grab_inode_ref(lockres); - - INIT_WORK(&lockres->l_work, user_dlm_unblock_lock); - - queue_work(user_dlm_worker, &lockres->l_work); - lockres->l_flags |= USER_LOCK_QUEUED; - } -} - -static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres) -{ - int queue = 0; - - if (!(lockres->l_flags & USER_LOCK_BLOCKED)) - return; - - switch (lockres->l_blocking) { - case LKM_EXMODE: - if (!lockres->l_ex_holders && !lockres->l_ro_holders) - queue = 1; - break; - case LKM_PRMODE: - if (!lockres->l_ex_holders) - queue = 1; - break; - default: - BUG(); - } - - if (queue) - __user_dlm_queue_lockres(lockres); -} - -static void user_bast(void *opaque, int level) -{ - struct user_lock_res *lockres = opaque; - - mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n", - lockres->l_namelen, lockres->l_name, level); - - spin_lock(&lockres->l_lock); - lockres->l_flags |= USER_LOCK_BLOCKED; - if (level > lockres->l_blocking) - lockres->l_blocking = level; - - __user_dlm_queue_lockres(lockres); - spin_unlock(&lockres->l_lock); - - wake_up(&lockres->l_event); -} - -static void user_unlock_ast(void *opaque, enum dlm_status status) -{ - struct user_lock_res *lockres = opaque; - - mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen, - lockres->l_name); - - if (status != DLM_NORMAL && status != DLM_CANCELGRANT) - mlog(ML_ERROR, "Dlm returns status %d\n", status); - - spin_lock(&lockres->l_lock); - /* The teardown flag gets set early during the unlock process, - * so test the cancel flag to make sure that this ast isn't - * for a concurrent cancel. */ - if (lockres->l_flags & USER_LOCK_IN_TEARDOWN - && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { - lockres->l_level = LKM_IVMODE; - } else if (status == DLM_CANCELGRANT) { - /* We tried to cancel a convert request, but it was - * already granted. Don't clear the busy flag - the - * ast should've done this already. */ - BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); - lockres->l_flags &= ~USER_LOCK_IN_CANCEL; - goto out_noclear; - } else { - BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); - /* Cancel succeeded, we want to re-queue */ - lockres->l_requested = LKM_IVMODE; /* cancel an - * upconvert - * request. */ - lockres->l_flags &= ~USER_LOCK_IN_CANCEL; - /* we want the unblock thread to look at it again - * now. */ - if (lockres->l_flags & USER_LOCK_BLOCKED) - __user_dlm_queue_lockres(lockres); - } - - lockres->l_flags &= ~USER_LOCK_BUSY; -out_noclear: - spin_unlock(&lockres->l_lock); - - wake_up(&lockres->l_event); -} - -static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) -{ - struct inode *inode; - inode = user_dlm_inode_from_user_lockres(lockres); - iput(inode); -} - -static void user_dlm_unblock_lock(struct work_struct *work) -{ - int new_level, status; - struct user_lock_res *lockres = - container_of(work, struct user_lock_res, l_work); - struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); - - mlog(0, "processing lockres %.*s\n", lockres->l_namelen, - lockres->l_name); - - spin_lock(&lockres->l_lock); - - mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), - "Lockres %.*s, flags 0x%x\n", - lockres->l_namelen, lockres->l_name, lockres->l_flags); - - /* notice that we don't clear USER_LOCK_BLOCKED here. If it's - * set, we want user_ast clear it. */ - lockres->l_flags &= ~USER_LOCK_QUEUED; - - /* It's valid to get here and no longer be blocked - if we get - * several basts in a row, we might be queued by the first - * one, the unblock thread might run and clear the queued - * flag, and finally we might get another bast which re-queues - * us before our ast for the downconvert is called. */ - if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { - spin_unlock(&lockres->l_lock); - goto drop_ref; - } - - if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { - spin_unlock(&lockres->l_lock); - goto drop_ref; - } - - if (lockres->l_flags & USER_LOCK_BUSY) { - if (lockres->l_flags & USER_LOCK_IN_CANCEL) { - spin_unlock(&lockres->l_lock); - goto drop_ref; - } - - lockres->l_flags |= USER_LOCK_IN_CANCEL; - spin_unlock(&lockres->l_lock); - - status = dlmunlock(dlm, - &lockres->l_lksb, - LKM_CANCEL, - user_unlock_ast, - lockres); - if (status != DLM_NORMAL) - user_log_dlm_error("dlmunlock", status, lockres); - goto drop_ref; - } - - /* If there are still incompat holders, we can exit safely - * without worrying about re-queueing this lock as that will - * happen on the last call to user_cluster_unlock. */ - if ((lockres->l_blocking == LKM_EXMODE) - && (lockres->l_ex_holders || lockres->l_ro_holders)) { - spin_unlock(&lockres->l_lock); - mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n", - lockres->l_ro_holders, lockres->l_ex_holders); - goto drop_ref; - } - - if ((lockres->l_blocking == LKM_PRMODE) - && lockres->l_ex_holders) { - spin_unlock(&lockres->l_lock); - mlog(0, "can't downconvert for pr: ex = %u\n", - lockres->l_ex_holders); - goto drop_ref; - } - - /* yay, we can downconvert now. */ - new_level = user_highest_compat_lock_level(lockres->l_blocking); - lockres->l_requested = new_level; - lockres->l_flags |= USER_LOCK_BUSY; - mlog(0, "Downconvert lock from %d to %d\n", - lockres->l_level, new_level); - spin_unlock(&lockres->l_lock); - - /* need lock downconvert request now... */ - status = dlmlock(dlm, - new_level, - &lockres->l_lksb, - LKM_CONVERT|LKM_VALBLK, - lockres->l_name, - lockres->l_namelen, - user_ast, - lockres, - user_bast); - if (status != DLM_NORMAL) { - user_log_dlm_error("dlmlock", status, lockres); - user_recover_from_dlm_error(lockres); - } - -drop_ref: - user_dlm_drop_inode_ref(lockres); -} - -static inline void user_dlm_inc_holders(struct user_lock_res *lockres, - int level) -{ - switch(level) { - case LKM_EXMODE: - lockres->l_ex_holders++; - break; - case LKM_PRMODE: - lockres->l_ro_holders++; - break; - default: - BUG(); - } -} - -/* predict what lock level we'll be dropping down to on behalf - * of another node, and return true if the currently wanted - * level will be compatible with it. */ -static inline int -user_may_continue_on_blocked_lock(struct user_lock_res *lockres, - int wanted) -{ - BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); - - return wanted <= user_highest_compat_lock_level(lockres->l_blocking); -} - -int user_dlm_cluster_lock(struct user_lock_res *lockres, - int level, - int lkm_flags) -{ - int status, local_flags; - struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); - - if (level != LKM_EXMODE && - level != LKM_PRMODE) { - mlog(ML_ERROR, "lockres %.*s: invalid request!\n", - lockres->l_namelen, lockres->l_name); - status = -EINVAL; - goto bail; - } - - mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n", - lockres->l_namelen, lockres->l_name, - (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE", - lkm_flags); - -again: - if (signal_pending(current)) { - status = -ERESTARTSYS; - goto bail; - } - - spin_lock(&lockres->l_lock); - - /* We only compare against the currently granted level - * here. If the lock is blocked waiting on a downconvert, - * we'll get caught below. */ - if ((lockres->l_flags & USER_LOCK_BUSY) && - (level > lockres->l_level)) { - /* is someone sitting in dlm_lock? If so, wait on - * them. */ - spin_unlock(&lockres->l_lock); - - user_wait_on_busy_lock(lockres); - goto again; - } - - if ((lockres->l_flags & USER_LOCK_BLOCKED) && - (!user_may_continue_on_blocked_lock(lockres, level))) { - /* is the lock is currently blocked on behalf of - * another node */ - spin_unlock(&lockres->l_lock); - - user_wait_on_blocked_lock(lockres); - goto again; - } - - if (level > lockres->l_level) { - local_flags = lkm_flags | LKM_VALBLK; - if (lockres->l_level != LKM_IVMODE) - local_flags |= LKM_CONVERT; - - lockres->l_requested = level; - lockres->l_flags |= USER_LOCK_BUSY; - spin_unlock(&lockres->l_lock); - - BUG_ON(level == LKM_IVMODE); - BUG_ON(level == LKM_NLMODE); - - /* call dlm_lock to upgrade lock now */ - status = dlmlock(dlm, - level, - &lockres->l_lksb, - local_flags, - lockres->l_name, - lockres->l_namelen, - user_ast, - lockres, - user_bast); - if (status != DLM_NORMAL) { - if ((lkm_flags & LKM_NOQUEUE) && - (status == DLM_NOTQUEUED)) - status = -EAGAIN; - else { - user_log_dlm_error("dlmlock", status, lockres); - status = -EINVAL; - } - user_recover_from_dlm_error(lockres); - goto bail; - } - - user_wait_on_busy_lock(lockres); - goto again; - } - - user_dlm_inc_holders(lockres, level); - spin_unlock(&lockres->l_lock); - - status = 0; -bail: - return status; -} - -static inline void user_dlm_dec_holders(struct user_lock_res *lockres, - int level) -{ - switch(level) { - case LKM_EXMODE: - BUG_ON(!lockres->l_ex_holders); - lockres->l_ex_holders--; - break; - case LKM_PRMODE: - BUG_ON(!lockres->l_ro_holders); - lockres->l_ro_holders--; - break; - default: - BUG(); - } -} - -void user_dlm_cluster_unlock(struct user_lock_res *lockres, - int level) -{ - if (level != LKM_EXMODE && - level != LKM_PRMODE) { - mlog(ML_ERROR, "lockres %.*s: invalid request!\n", - lockres->l_namelen, lockres->l_name); - return; - } - - spin_lock(&lockres->l_lock); - user_dlm_dec_holders(lockres, level); - __user_dlm_cond_queue_lockres(lockres); - spin_unlock(&lockres->l_lock); -} - -void user_dlm_write_lvb(struct inode *inode, - const char *val, - unsigned int len) -{ - struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; - char *lvb = lockres->l_lksb.lvb; - - BUG_ON(len > DLM_LVB_LEN); - - spin_lock(&lockres->l_lock); - - BUG_ON(lockres->l_level < LKM_EXMODE); - memcpy(lvb, val, len); - - spin_unlock(&lockres->l_lock); -} - -void user_dlm_read_lvb(struct inode *inode, - char *val, - unsigned int len) -{ - struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; - char *lvb = lockres->l_lksb.lvb; - - BUG_ON(len > DLM_LVB_LEN); - - spin_lock(&lockres->l_lock); - - BUG_ON(lockres->l_level < LKM_PRMODE); - memcpy(val, lvb, len); - - spin_unlock(&lockres->l_lock); -} - -void user_dlm_lock_res_init(struct user_lock_res *lockres, - struct dentry *dentry) -{ - memset(lockres, 0, sizeof(*lockres)); - - spin_lock_init(&lockres->l_lock); - init_waitqueue_head(&lockres->l_event); - lockres->l_level = LKM_IVMODE; - lockres->l_requested = LKM_IVMODE; - lockres->l_blocking = LKM_IVMODE; - - /* should have been checked before getting here. */ - BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); - - memcpy(lockres->l_name, - dentry->d_name.name, - dentry->d_name.len); - lockres->l_namelen = dentry->d_name.len; -} - -int user_dlm_destroy_lock(struct user_lock_res *lockres) -{ - int status = -EBUSY; - struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); - - mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name); - - spin_lock(&lockres->l_lock); - if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { - spin_unlock(&lockres->l_lock); - return 0; - } - - lockres->l_flags |= USER_LOCK_IN_TEARDOWN; - - while (lockres->l_flags & USER_LOCK_BUSY) { - spin_unlock(&lockres->l_lock); - - user_wait_on_busy_lock(lockres); - - spin_lock(&lockres->l_lock); - } - - if (lockres->l_ro_holders || lockres->l_ex_holders) { - spin_unlock(&lockres->l_lock); - goto bail; - } - - status = 0; - if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { - spin_unlock(&lockres->l_lock); - goto bail; - } - - lockres->l_flags &= ~USER_LOCK_ATTACHED; - lockres->l_flags |= USER_LOCK_BUSY; - spin_unlock(&lockres->l_lock); - - status = dlmunlock(dlm, - &lockres->l_lksb, - LKM_VALBLK, - user_unlock_ast, - lockres); - if (status != DLM_NORMAL) { - user_log_dlm_error("dlmunlock", status, lockres); - status = -EINVAL; - goto bail; - } - - user_wait_on_busy_lock(lockres); - - status = 0; -bail: - return status; -} - -struct dlm_ctxt *user_dlm_register_context(struct qstr *name, - struct dlm_protocol_version *proto) -{ - struct dlm_ctxt *dlm; - u32 dlm_key; - char *domain; - - domain = kmalloc(name->len + 1, GFP_NOFS); - if (!domain) { - mlog_errno(-ENOMEM); - return ERR_PTR(-ENOMEM); - } - - dlm_key = crc32_le(0, name->name, name->len); - - snprintf(domain, name->len + 1, "%.*s", name->len, name->name); - - dlm = dlm_register_domain(domain, dlm_key, proto); - if (IS_ERR(dlm)) - mlog_errno(PTR_ERR(dlm)); - - kfree(domain); - return dlm; -} - -void user_dlm_unregister_context(struct dlm_ctxt *dlm) -{ - dlm_unregister_domain(dlm); -} diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlm/userdlm.h deleted file mode 100644 index 0c3cc03c61fa..000000000000 --- a/fs/ocfs2/dlm/userdlm.h +++ /dev/null @@ -1,113 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * userdlm.h - * - * Userspace dlm defines - * - * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - - -#ifndef USERDLM_H -#define USERDLM_H - -#include <linux/module.h> -#include <linux/fs.h> -#include <linux/types.h> -#include <linux/workqueue.h> - -/* user_lock_res->l_flags flags. */ -#define USER_LOCK_ATTACHED (0x00000001) /* we have initialized - * the lvb */ -#define USER_LOCK_BUSY (0x00000002) /* we are currently in - * dlm_lock */ -#define USER_LOCK_BLOCKED (0x00000004) /* blocked waiting to - * downconvert*/ -#define USER_LOCK_IN_TEARDOWN (0x00000008) /* we're currently - * destroying this - * lock. */ -#define USER_LOCK_QUEUED (0x00000010) /* lock is on the - * workqueue */ -#define USER_LOCK_IN_CANCEL (0x00000020) - -struct user_lock_res { - spinlock_t l_lock; - - int l_flags; - -#define USER_DLM_LOCK_ID_MAX_LEN 32 - char l_name[USER_DLM_LOCK_ID_MAX_LEN]; - int l_namelen; - int l_level; - unsigned int l_ro_holders; - unsigned int l_ex_holders; - struct dlm_lockstatus l_lksb; - - int l_requested; - int l_blocking; - - wait_queue_head_t l_event; - - struct work_struct l_work; -}; - -extern struct workqueue_struct *user_dlm_worker; - -void user_dlm_lock_res_init(struct user_lock_res *lockres, - struct dentry *dentry); -int user_dlm_destroy_lock(struct user_lock_res *lockres); -int user_dlm_cluster_lock(struct user_lock_res *lockres, - int level, - int lkm_flags); -void user_dlm_cluster_unlock(struct user_lock_res *lockres, - int level); -void user_dlm_write_lvb(struct inode *inode, - const char *val, - unsigned int len); -void user_dlm_read_lvb(struct inode *inode, - char *val, - unsigned int len); -struct dlm_ctxt *user_dlm_register_context(struct qstr *name, - struct dlm_protocol_version *proto); -void user_dlm_unregister_context(struct dlm_ctxt *dlm); - -struct dlmfs_inode_private { - struct dlm_ctxt *ip_dlm; - - struct user_lock_res ip_lockres; /* unused for directories. */ - struct inode *ip_parent; - - struct inode ip_vfs_inode; -}; - -static inline struct dlmfs_inode_private * -DLMFS_I(struct inode *inode) -{ - return container_of(inode, - struct dlmfs_inode_private, - ip_vfs_inode); -} - -struct dlmfs_filp_private { - int fp_lock_level; -}; - -#define DLMFS_MAGIC 0x76a9f425 - -#endif /* USERDLM_H */ diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile new file mode 100644 index 000000000000..df69b4856d0d --- /dev/null +++ b/fs/ocfs2/dlmfs/Makefile @@ -0,0 +1,5 @@ +EXTRA_CFLAGS += -Ifs/ocfs2 + +obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o + +ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c new file mode 100644 index 000000000000..e21ce0e5fc42 --- /dev/null +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -0,0 +1,710 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmfs.c + * + * Code which implements the kernel side of a minimal userspace + * interface to our DLM. This file handles the virtual file system + * used for communication with userspace. Credit should go to ramfs, + * which was a template for the fs side of this module. + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +/* Simple VFS hooks based on: */ +/* + * Resizable simple ram filesystem for Linux. + * + * Copyright (C) 2000 Linus Torvalds. + * 2000 Transmeta Corp. + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/backing-dev.h> +#include <linux/poll.h> + +#include <asm/uaccess.h> + + +#include "cluster/nodemanager.h" +#include "cluster/heartbeat.h" +#include "cluster/tcp.h" + +#include "dlm/dlmapi.h" + +#include "userdlm.h" + +#include "dlmfsver.h" + +#define MLOG_MASK_PREFIX ML_DLMFS +#include "cluster/masklog.h" + +#include "ocfs2_lockingver.h" + +static const struct super_operations dlmfs_ops; +static const struct file_operations dlmfs_file_operations; +static const struct inode_operations dlmfs_dir_inode_operations; +static const struct inode_operations dlmfs_root_inode_operations; +static const struct inode_operations dlmfs_file_inode_operations; +static struct kmem_cache *dlmfs_inode_cache; + +struct workqueue_struct *user_dlm_worker; + +/* + * This is the userdlmfs locking protocol version. + * + * See fs/ocfs2/dlmglue.c for more details on locking versions. + */ +static const struct dlm_protocol_version user_locking_protocol = { + .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, + .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, +}; + + +/* + * These are the ABI capabilities of dlmfs. + * + * Over time, dlmfs has added some features that were not part of the + * initial ABI. Unfortunately, some of these features are not detectable + * via standard usage. For example, Linux's default poll always returns + * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs + * added poll support. Instead, we provide this list of new capabilities. + * + * Capabilities is a read-only attribute. We do it as a module parameter + * so we can discover it whether dlmfs is built in, loaded, or even not + * loaded. + * + * The ABI features are local to this machine's dlmfs mount. This is + * distinct from the locking protocol, which is concerned with inter-node + * interaction. + * + * Capabilities: + * - bast : POLLIN against the file descriptor of a held lock + * signifies a bast fired on the lock. + */ +#define DLMFS_CAPABILITIES "bast" +extern int param_set_dlmfs_capabilities(const char *val, + struct kernel_param *kp) +{ + printk(KERN_ERR "%s: readonly parameter\n", kp->name); + return -EINVAL; +} +static int param_get_dlmfs_capabilities(char *buffer, + struct kernel_param *kp) +{ + return strlcpy(buffer, DLMFS_CAPABILITIES, + strlen(DLMFS_CAPABILITIES) + 1); +} +module_param_call(capabilities, param_set_dlmfs_capabilities, + param_get_dlmfs_capabilities, NULL, 0444); +MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES); + + +/* + * decodes a set of open flags into a valid lock level and a set of flags. + * returns < 0 if we have invalid flags + * flags which mean something to us: + * O_RDONLY -> PRMODE level + * O_WRONLY -> EXMODE level + * + * O_NONBLOCK -> LKM_NOQUEUE + */ +static int dlmfs_decode_open_flags(int open_flags, + int *level, + int *flags) +{ + if (open_flags & (O_WRONLY|O_RDWR)) + *level = LKM_EXMODE; + else + *level = LKM_PRMODE; + + *flags = 0; + if (open_flags & O_NONBLOCK) + *flags |= LKM_NOQUEUE; + + return 0; +} + +static int dlmfs_file_open(struct inode *inode, + struct file *file) +{ + int status, level, flags; + struct dlmfs_filp_private *fp = NULL; + struct dlmfs_inode_private *ip; + + if (S_ISDIR(inode->i_mode)) + BUG(); + + mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino, + file->f_flags); + + status = dlmfs_decode_open_flags(file->f_flags, &level, &flags); + if (status < 0) + goto bail; + + /* We don't want to honor O_APPEND at read/write time as it + * doesn't make sense for LVB writes. */ + file->f_flags &= ~O_APPEND; + + fp = kmalloc(sizeof(*fp), GFP_NOFS); + if (!fp) { + status = -ENOMEM; + goto bail; + } + fp->fp_lock_level = level; + + ip = DLMFS_I(inode); + + status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags); + if (status < 0) { + /* this is a strange error to return here but I want + * to be able userspace to be able to distinguish a + * valid lock request from one that simply couldn't be + * granted. */ + if (flags & LKM_NOQUEUE && status == -EAGAIN) + status = -ETXTBSY; + kfree(fp); + goto bail; + } + + file->private_data = fp; +bail: + return status; +} + +static int dlmfs_file_release(struct inode *inode, + struct file *file) +{ + int level, status; + struct dlmfs_inode_private *ip = DLMFS_I(inode); + struct dlmfs_filp_private *fp = + (struct dlmfs_filp_private *) file->private_data; + + if (S_ISDIR(inode->i_mode)) + BUG(); + + mlog(0, "close called on inode %lu\n", inode->i_ino); + + status = 0; + if (fp) { + level = fp->fp_lock_level; + if (level != LKM_IVMODE) + user_dlm_cluster_unlock(&ip->ip_lockres, level); + + kfree(fp); + file->private_data = NULL; + } + + return 0; +} + +static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait) +{ + int event = 0; + struct inode *inode = file->f_path.dentry->d_inode; + struct dlmfs_inode_private *ip = DLMFS_I(inode); + + poll_wait(file, &ip->ip_lockres.l_event, wait); + + spin_lock(&ip->ip_lockres.l_lock); + if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED) + event = POLLIN | POLLRDNORM; + spin_unlock(&ip->ip_lockres.l_lock); + + return event; +} + +static ssize_t dlmfs_file_read(struct file *filp, + char __user *buf, + size_t count, + loff_t *ppos) +{ + int bytes_left; + ssize_t readlen; + char *lvb_buf; + struct inode *inode = filp->f_path.dentry->d_inode; + + mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", + inode->i_ino, count, *ppos); + + if (*ppos >= i_size_read(inode)) + return 0; + + if (!count) + return 0; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + /* don't read past the lvb */ + if ((count + *ppos) > i_size_read(inode)) + readlen = i_size_read(inode) - *ppos; + else + readlen = count - *ppos; + + lvb_buf = kmalloc(readlen, GFP_NOFS); + if (!lvb_buf) + return -ENOMEM; + + user_dlm_read_lvb(inode, lvb_buf, readlen); + bytes_left = __copy_to_user(buf, lvb_buf, readlen); + readlen -= bytes_left; + + kfree(lvb_buf); + + *ppos = *ppos + readlen; + + mlog(0, "read %zd bytes\n", readlen); + return readlen; +} + +static ssize_t dlmfs_file_write(struct file *filp, + const char __user *buf, + size_t count, + loff_t *ppos) +{ + int bytes_left; + ssize_t writelen; + char *lvb_buf; + struct inode *inode = filp->f_path.dentry->d_inode; + + mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", + inode->i_ino, count, *ppos); + + if (*ppos >= i_size_read(inode)) + return -ENOSPC; + + if (!count) + return 0; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + /* don't write past the lvb */ + if ((count + *ppos) > i_size_read(inode)) + writelen = i_size_read(inode) - *ppos; + else + writelen = count - *ppos; + + lvb_buf = kmalloc(writelen, GFP_NOFS); + if (!lvb_buf) + return -ENOMEM; + + bytes_left = copy_from_user(lvb_buf, buf, writelen); + writelen -= bytes_left; + if (writelen) + user_dlm_write_lvb(inode, lvb_buf, writelen); + + kfree(lvb_buf); + + *ppos = *ppos + writelen; + mlog(0, "wrote %zd bytes\n", writelen); + return writelen; +} + +static void dlmfs_init_once(void *foo) +{ + struct dlmfs_inode_private *ip = + (struct dlmfs_inode_private *) foo; + + ip->ip_dlm = NULL; + ip->ip_parent = NULL; + + inode_init_once(&ip->ip_vfs_inode); +} + +static struct inode *dlmfs_alloc_inode(struct super_block *sb) +{ + struct dlmfs_inode_private *ip; + + ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS); + if (!ip) + return NULL; + + return &ip->ip_vfs_inode; +} + +static void dlmfs_destroy_inode(struct inode *inode) +{ + kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); +} + +static void dlmfs_clear_inode(struct inode *inode) +{ + int status; + struct dlmfs_inode_private *ip; + + if (!inode) + return; + + mlog(0, "inode %lu\n", inode->i_ino); + + ip = DLMFS_I(inode); + + if (S_ISREG(inode->i_mode)) { + status = user_dlm_destroy_lock(&ip->ip_lockres); + if (status < 0) + mlog_errno(status); + iput(ip->ip_parent); + goto clear_fields; + } + + mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm); + /* we must be a directory. If required, lets unregister the + * dlm context now. */ + if (ip->ip_dlm) + user_dlm_unregister_context(ip->ip_dlm); +clear_fields: + ip->ip_parent = NULL; + ip->ip_dlm = NULL; +} + +static struct backing_dev_info dlmfs_backing_dev_info = { + .name = "ocfs2-dlmfs", + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; + +static struct inode *dlmfs_get_root_inode(struct super_block *sb) +{ + struct inode *inode = new_inode(sb); + int mode = S_IFDIR | 0755; + struct dlmfs_inode_private *ip; + + if (inode) { + ip = DLMFS_I(inode); + + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inc_nlink(inode); + + inode->i_fop = &simple_dir_operations; + inode->i_op = &dlmfs_root_inode_operations; + } + + return inode; +} + +static struct inode *dlmfs_get_inode(struct inode *parent, + struct dentry *dentry, + int mode) +{ + struct super_block *sb = parent->i_sb; + struct inode * inode = new_inode(sb); + struct dlmfs_inode_private *ip; + + if (!inode) + return NULL; + + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + ip = DLMFS_I(inode); + ip->ip_dlm = DLMFS_I(parent)->ip_dlm; + + switch (mode & S_IFMT) { + default: + /* for now we don't support anything other than + * directories and regular files. */ + BUG(); + break; + case S_IFREG: + inode->i_op = &dlmfs_file_inode_operations; + inode->i_fop = &dlmfs_file_operations; + + i_size_write(inode, DLM_LVB_LEN); + + user_dlm_lock_res_init(&ip->ip_lockres, dentry); + + /* released at clear_inode time, this insures that we + * get to drop the dlm reference on each lock *before* + * we call the unregister code for releasing parent + * directories. */ + ip->ip_parent = igrab(parent); + BUG_ON(!ip->ip_parent); + break; + case S_IFDIR: + inode->i_op = &dlmfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == + * 2 (for "." entry) */ + inc_nlink(inode); + break; + } + + if (parent->i_mode & S_ISGID) { + inode->i_gid = parent->i_gid; + if (S_ISDIR(mode)) + inode->i_mode |= S_ISGID; + } + + return inode; +} + +/* + * File creation. Allocate an inode, and we're done.. + */ +/* SMP-safe */ +static int dlmfs_mkdir(struct inode * dir, + struct dentry * dentry, + int mode) +{ + int status; + struct inode *inode = NULL; + struct qstr *domain = &dentry->d_name; + struct dlmfs_inode_private *ip; + struct dlm_ctxt *dlm; + struct dlm_protocol_version proto = user_locking_protocol; + + mlog(0, "mkdir %.*s\n", domain->len, domain->name); + + /* verify that we have a proper domain */ + if (domain->len >= O2NM_MAX_NAME_LEN) { + status = -EINVAL; + mlog(ML_ERROR, "invalid domain name for directory.\n"); + goto bail; + } + + inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + ip = DLMFS_I(inode); + + dlm = user_dlm_register_context(domain, &proto); + if (IS_ERR(dlm)) { + status = PTR_ERR(dlm); + mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", + status, domain->len, domain->name); + goto bail; + } + ip->ip_dlm = dlm; + + inc_nlink(dir); + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + + status = 0; +bail: + if (status < 0) + iput(inode); + return status; +} + +static int dlmfs_create(struct inode *dir, + struct dentry *dentry, + int mode, + struct nameidata *nd) +{ + int status = 0; + struct inode *inode; + struct qstr *name = &dentry->d_name; + + mlog(0, "create %.*s\n", name->len, name->name); + + /* verify name is valid and doesn't contain any dlm reserved + * characters */ + if (name->len >= USER_DLM_LOCK_ID_MAX_LEN || + name->name[0] == '$') { + status = -EINVAL; + mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len, + name->name); + goto bail; + } + + inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ +bail: + return status; +} + +static int dlmfs_unlink(struct inode *dir, + struct dentry *dentry) +{ + int status; + struct inode *inode = dentry->d_inode; + + mlog(0, "unlink inode %lu\n", inode->i_ino); + + /* if there are no current holders, or none that are waiting + * to acquire a lock, this basically destroys our lockres. */ + status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres); + if (status < 0) { + mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n", + dentry->d_name.len, dentry->d_name.name, status); + goto bail; + } + status = simple_unlink(dir, dentry); +bail: + return status; +} + +static int dlmfs_fill_super(struct super_block * sb, + void * data, + int silent) +{ + struct inode * inode; + struct dentry * root; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = DLMFS_MAGIC; + sb->s_op = &dlmfs_ops; + inode = dlmfs_get_root_inode(sb); + if (!inode) + return -ENOMEM; + + root = d_alloc_root(inode); + if (!root) { + iput(inode); + return -ENOMEM; + } + sb->s_root = root; + return 0; +} + +static const struct file_operations dlmfs_file_operations = { + .open = dlmfs_file_open, + .release = dlmfs_file_release, + .poll = dlmfs_file_poll, + .read = dlmfs_file_read, + .write = dlmfs_file_write, +}; + +static const struct inode_operations dlmfs_dir_inode_operations = { + .create = dlmfs_create, + .lookup = simple_lookup, + .unlink = dlmfs_unlink, +}; + +/* this way we can restrict mkdir to only the toplevel of the fs. */ +static const struct inode_operations dlmfs_root_inode_operations = { + .lookup = simple_lookup, + .mkdir = dlmfs_mkdir, + .rmdir = simple_rmdir, +}; + +static const struct super_operations dlmfs_ops = { + .statfs = simple_statfs, + .alloc_inode = dlmfs_alloc_inode, + .destroy_inode = dlmfs_destroy_inode, + .clear_inode = dlmfs_clear_inode, + .drop_inode = generic_delete_inode, +}; + +static const struct inode_operations dlmfs_file_inode_operations = { + .getattr = simple_getattr, +}; + +static int dlmfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) +{ + return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt); +} + +static struct file_system_type dlmfs_fs_type = { + .owner = THIS_MODULE, + .name = "ocfs2_dlmfs", + .get_sb = dlmfs_get_sb, + .kill_sb = kill_litter_super, +}; + +static int __init init_dlmfs_fs(void) +{ + int status; + int cleanup_inode = 0, cleanup_worker = 0; + + dlmfs_print_version(); + + status = bdi_init(&dlmfs_backing_dev_info); + if (status) + return status; + + dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", + sizeof(struct dlmfs_inode_private), + 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + dlmfs_init_once); + if (!dlmfs_inode_cache) { + status = -ENOMEM; + goto bail; + } + cleanup_inode = 1; + + user_dlm_worker = create_singlethread_workqueue("user_dlm"); + if (!user_dlm_worker) { + status = -ENOMEM; + goto bail; + } + cleanup_worker = 1; + + status = register_filesystem(&dlmfs_fs_type); +bail: + if (status) { + if (cleanup_inode) + kmem_cache_destroy(dlmfs_inode_cache); + if (cleanup_worker) + destroy_workqueue(user_dlm_worker); + bdi_destroy(&dlmfs_backing_dev_info); + } else + printk("OCFS2 User DLM kernel interface loaded\n"); + return status; +} + +static void __exit exit_dlmfs_fs(void) +{ + unregister_filesystem(&dlmfs_fs_type); + + flush_workqueue(user_dlm_worker); + destroy_workqueue(user_dlm_worker); + + kmem_cache_destroy(dlmfs_inode_cache); + + bdi_destroy(&dlmfs_backing_dev_info); +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); + +module_init(init_dlmfs_fs) +module_exit(exit_dlmfs_fs) diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c new file mode 100644 index 000000000000..a733b3321f83 --- /dev/null +++ b/fs/ocfs2/dlmfs/dlmfsver.c @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmfsver.c + * + * version string + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/module.h> +#include <linux/kernel.h> + +#include "dlmfsver.h" + +#define DLM_BUILD_VERSION "1.5.0" + +#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION + +void dlmfs_print_version(void) +{ + printk(KERN_INFO "%s\n", VERSION_STR); +} + +MODULE_DESCRIPTION(VERSION_STR); + +MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h new file mode 100644 index 000000000000..f35eadbed25c --- /dev/null +++ b/fs/ocfs2/dlmfs/dlmfsver.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmver.h + * + * Function prototypes + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef DLMFS_VER_H +#define DLMFS_VER_H + +void dlmfs_print_version(void); + +#endif /* DLMFS_VER_H */ diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c new file mode 100644 index 000000000000..6adae70cee8e --- /dev/null +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -0,0 +1,676 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * userdlm.c + * + * Code which implements the kernel side of a minimal userspace + * interface to our DLM. + * + * Many of the functions here are pared down versions of dlmglue.c + * functions. + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/signal.h> + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/crc32.h> + + +#include "cluster/nodemanager.h" +#include "cluster/heartbeat.h" +#include "cluster/tcp.h" + +#include "dlm/dlmapi.h" + +#include "userdlm.h" + +#define MLOG_MASK_PREFIX ML_DLMFS +#include "cluster/masklog.h" + +static inline int user_check_wait_flag(struct user_lock_res *lockres, + int flag) +{ + int ret; + + spin_lock(&lockres->l_lock); + ret = lockres->l_flags & flag; + spin_unlock(&lockres->l_lock); + + return ret; +} + +static inline void user_wait_on_busy_lock(struct user_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !user_check_wait_flag(lockres, USER_LOCK_BUSY)); +} + +static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !user_check_wait_flag(lockres, USER_LOCK_BLOCKED)); +} + +/* I heart container_of... */ +static inline struct dlm_ctxt * +dlm_ctxt_from_user_lockres(struct user_lock_res *lockres) +{ + struct dlmfs_inode_private *ip; + + ip = container_of(lockres, + struct dlmfs_inode_private, + ip_lockres); + return ip->ip_dlm; +} + +static struct inode * +user_dlm_inode_from_user_lockres(struct user_lock_res *lockres) +{ + struct dlmfs_inode_private *ip; + + ip = container_of(lockres, + struct dlmfs_inode_private, + ip_lockres); + return &ip->ip_vfs_inode; +} + +static inline void user_recover_from_dlm_error(struct user_lock_res *lockres) +{ + spin_lock(&lockres->l_lock); + lockres->l_flags &= ~USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); +} + +#define user_log_dlm_error(_func, _stat, _lockres) do { \ + mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ + "resource %.*s: %s\n", dlm_errname(_stat), _func, \ + _lockres->l_namelen, _lockres->l_name, dlm_errmsg(_stat)); \ +} while (0) + +/* WARNING: This function lives in a world where the only three lock + * levels are EX, PR, and NL. It *will* have to be adjusted when more + * lock types are added. */ +static inline int user_highest_compat_lock_level(int level) +{ + int new_level = LKM_EXMODE; + + if (level == LKM_EXMODE) + new_level = LKM_NLMODE; + else if (level == LKM_PRMODE) + new_level = LKM_PRMODE; + return new_level; +} + +static void user_ast(void *opaque) +{ + struct user_lock_res *lockres = opaque; + struct dlm_lockstatus *lksb; + + mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen, + lockres->l_name); + + spin_lock(&lockres->l_lock); + + lksb = &(lockres->l_lksb); + if (lksb->status != DLM_NORMAL) { + mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", + lksb->status, lockres->l_namelen, lockres->l_name); + spin_unlock(&lockres->l_lock); + return; + } + + mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, + "Lockres %.*s, requested ivmode. flags 0x%x\n", + lockres->l_namelen, lockres->l_name, lockres->l_flags); + + /* we're downconverting. */ + if (lockres->l_requested < lockres->l_level) { + if (lockres->l_requested <= + user_highest_compat_lock_level(lockres->l_blocking)) { + lockres->l_blocking = LKM_NLMODE; + lockres->l_flags &= ~USER_LOCK_BLOCKED; + } + } + + lockres->l_level = lockres->l_requested; + lockres->l_requested = LKM_IVMODE; + lockres->l_flags |= USER_LOCK_ATTACHED; + lockres->l_flags &= ~USER_LOCK_BUSY; + + spin_unlock(&lockres->l_lock); + + wake_up(&lockres->l_event); +} + +static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres) +{ + struct inode *inode; + inode = user_dlm_inode_from_user_lockres(lockres); + if (!igrab(inode)) + BUG(); +} + +static void user_dlm_unblock_lock(struct work_struct *work); + +static void __user_dlm_queue_lockres(struct user_lock_res *lockres) +{ + if (!(lockres->l_flags & USER_LOCK_QUEUED)) { + user_dlm_grab_inode_ref(lockres); + + INIT_WORK(&lockres->l_work, user_dlm_unblock_lock); + + queue_work(user_dlm_worker, &lockres->l_work); + lockres->l_flags |= USER_LOCK_QUEUED; + } +} + +static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres) +{ + int queue = 0; + + if (!(lockres->l_flags & USER_LOCK_BLOCKED)) + return; + + switch (lockres->l_blocking) { + case LKM_EXMODE: + if (!lockres->l_ex_holders && !lockres->l_ro_holders) + queue = 1; + break; + case LKM_PRMODE: + if (!lockres->l_ex_holders) + queue = 1; + break; + default: + BUG(); + } + + if (queue) + __user_dlm_queue_lockres(lockres); +} + +static void user_bast(void *opaque, int level) +{ + struct user_lock_res *lockres = opaque; + + mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n", + lockres->l_namelen, lockres->l_name, level); + + spin_lock(&lockres->l_lock); + lockres->l_flags |= USER_LOCK_BLOCKED; + if (level > lockres->l_blocking) + lockres->l_blocking = level; + + __user_dlm_queue_lockres(lockres); + spin_unlock(&lockres->l_lock); + + wake_up(&lockres->l_event); +} + +static void user_unlock_ast(void *opaque, enum dlm_status status) +{ + struct user_lock_res *lockres = opaque; + + mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen, + lockres->l_name); + + if (status != DLM_NORMAL && status != DLM_CANCELGRANT) + mlog(ML_ERROR, "Dlm returns status %d\n", status); + + spin_lock(&lockres->l_lock); + /* The teardown flag gets set early during the unlock process, + * so test the cancel flag to make sure that this ast isn't + * for a concurrent cancel. */ + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN + && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { + lockres->l_level = LKM_IVMODE; + } else if (status == DLM_CANCELGRANT) { + /* We tried to cancel a convert request, but it was + * already granted. Don't clear the busy flag - the + * ast should've done this already. */ + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + goto out_noclear; + } else { + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + /* Cancel succeeded, we want to re-queue */ + lockres->l_requested = LKM_IVMODE; /* cancel an + * upconvert + * request. */ + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + /* we want the unblock thread to look at it again + * now. */ + if (lockres->l_flags & USER_LOCK_BLOCKED) + __user_dlm_queue_lockres(lockres); + } + + lockres->l_flags &= ~USER_LOCK_BUSY; +out_noclear: + spin_unlock(&lockres->l_lock); + + wake_up(&lockres->l_event); +} + +static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) +{ + struct inode *inode; + inode = user_dlm_inode_from_user_lockres(lockres); + iput(inode); +} + +static void user_dlm_unblock_lock(struct work_struct *work) +{ + int new_level, status; + struct user_lock_res *lockres = + container_of(work, struct user_lock_res, l_work); + struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); + + mlog(0, "processing lockres %.*s\n", lockres->l_namelen, + lockres->l_name); + + spin_lock(&lockres->l_lock); + + mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), + "Lockres %.*s, flags 0x%x\n", + lockres->l_namelen, lockres->l_name, lockres->l_flags); + + /* notice that we don't clear USER_LOCK_BLOCKED here. If it's + * set, we want user_ast clear it. */ + lockres->l_flags &= ~USER_LOCK_QUEUED; + + /* It's valid to get here and no longer be blocked - if we get + * several basts in a row, we might be queued by the first + * one, the unblock thread might run and clear the queued + * flag, and finally we might get another bast which re-queues + * us before our ast for the downconvert is called. */ + if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + + if (lockres->l_flags & USER_LOCK_BUSY) { + if (lockres->l_flags & USER_LOCK_IN_CANCEL) { + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + + lockres->l_flags |= USER_LOCK_IN_CANCEL; + spin_unlock(&lockres->l_lock); + + status = dlmunlock(dlm, + &lockres->l_lksb, + LKM_CANCEL, + user_unlock_ast, + lockres); + if (status != DLM_NORMAL) + user_log_dlm_error("dlmunlock", status, lockres); + goto drop_ref; + } + + /* If there are still incompat holders, we can exit safely + * without worrying about re-queueing this lock as that will + * happen on the last call to user_cluster_unlock. */ + if ((lockres->l_blocking == LKM_EXMODE) + && (lockres->l_ex_holders || lockres->l_ro_holders)) { + spin_unlock(&lockres->l_lock); + mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n", + lockres->l_ro_holders, lockres->l_ex_holders); + goto drop_ref; + } + + if ((lockres->l_blocking == LKM_PRMODE) + && lockres->l_ex_holders) { + spin_unlock(&lockres->l_lock); + mlog(0, "can't downconvert for pr: ex = %u\n", + lockres->l_ex_holders); + goto drop_ref; + } + + /* yay, we can downconvert now. */ + new_level = user_highest_compat_lock_level(lockres->l_blocking); + lockres->l_requested = new_level; + lockres->l_flags |= USER_LOCK_BUSY; + mlog(0, "Downconvert lock from %d to %d\n", + lockres->l_level, new_level); + spin_unlock(&lockres->l_lock); + + /* need lock downconvert request now... */ + status = dlmlock(dlm, + new_level, + &lockres->l_lksb, + LKM_CONVERT|LKM_VALBLK, + lockres->l_name, + lockres->l_namelen, + user_ast, + lockres, + user_bast); + if (status != DLM_NORMAL) { + user_log_dlm_error("dlmlock", status, lockres); + user_recover_from_dlm_error(lockres); + } + +drop_ref: + user_dlm_drop_inode_ref(lockres); +} + +static inline void user_dlm_inc_holders(struct user_lock_res *lockres, + int level) +{ + switch(level) { + case LKM_EXMODE: + lockres->l_ex_holders++; + break; + case LKM_PRMODE: + lockres->l_ro_holders++; + break; + default: + BUG(); + } +} + +/* predict what lock level we'll be dropping down to on behalf + * of another node, and return true if the currently wanted + * level will be compatible with it. */ +static inline int +user_may_continue_on_blocked_lock(struct user_lock_res *lockres, + int wanted) +{ + BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); + + return wanted <= user_highest_compat_lock_level(lockres->l_blocking); +} + +int user_dlm_cluster_lock(struct user_lock_res *lockres, + int level, + int lkm_flags) +{ + int status, local_flags; + struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); + + if (level != LKM_EXMODE && + level != LKM_PRMODE) { + mlog(ML_ERROR, "lockres %.*s: invalid request!\n", + lockres->l_namelen, lockres->l_name); + status = -EINVAL; + goto bail; + } + + mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n", + lockres->l_namelen, lockres->l_name, + (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE", + lkm_flags); + +again: + if (signal_pending(current)) { + status = -ERESTARTSYS; + goto bail; + } + + spin_lock(&lockres->l_lock); + + /* We only compare against the currently granted level + * here. If the lock is blocked waiting on a downconvert, + * we'll get caught below. */ + if ((lockres->l_flags & USER_LOCK_BUSY) && + (level > lockres->l_level)) { + /* is someone sitting in dlm_lock? If so, wait on + * them. */ + spin_unlock(&lockres->l_lock); + + user_wait_on_busy_lock(lockres); + goto again; + } + + if ((lockres->l_flags & USER_LOCK_BLOCKED) && + (!user_may_continue_on_blocked_lock(lockres, level))) { + /* is the lock is currently blocked on behalf of + * another node */ + spin_unlock(&lockres->l_lock); + + user_wait_on_blocked_lock(lockres); + goto again; + } + + if (level > lockres->l_level) { + local_flags = lkm_flags | LKM_VALBLK; + if (lockres->l_level != LKM_IVMODE) + local_flags |= LKM_CONVERT; + + lockres->l_requested = level; + lockres->l_flags |= USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); + + BUG_ON(level == LKM_IVMODE); + BUG_ON(level == LKM_NLMODE); + + /* call dlm_lock to upgrade lock now */ + status = dlmlock(dlm, + level, + &lockres->l_lksb, + local_flags, + lockres->l_name, + lockres->l_namelen, + user_ast, + lockres, + user_bast); + if (status != DLM_NORMAL) { + if ((lkm_flags & LKM_NOQUEUE) && + (status == DLM_NOTQUEUED)) + status = -EAGAIN; + else { + user_log_dlm_error("dlmlock", status, lockres); + status = -EINVAL; + } + user_recover_from_dlm_error(lockres); + goto bail; + } + + user_wait_on_busy_lock(lockres); + goto again; + } + + user_dlm_inc_holders(lockres, level); + spin_unlock(&lockres->l_lock); + + status = 0; +bail: + return status; +} + +static inline void user_dlm_dec_holders(struct user_lock_res *lockres, + int level) +{ + switch(level) { + case LKM_EXMODE: + BUG_ON(!lockres->l_ex_holders); + lockres->l_ex_holders--; + break; + case LKM_PRMODE: + BUG_ON(!lockres->l_ro_holders); + lockres->l_ro_holders--; + break; + default: + BUG(); + } +} + +void user_dlm_cluster_unlock(struct user_lock_res *lockres, + int level) +{ + if (level != LKM_EXMODE && + level != LKM_PRMODE) { + mlog(ML_ERROR, "lockres %.*s: invalid request!\n", + lockres->l_namelen, lockres->l_name); + return; + } + + spin_lock(&lockres->l_lock); + user_dlm_dec_holders(lockres, level); + __user_dlm_cond_queue_lockres(lockres); + spin_unlock(&lockres->l_lock); +} + +void user_dlm_write_lvb(struct inode *inode, + const char *val, + unsigned int len) +{ + struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; + char *lvb = lockres->l_lksb.lvb; + + BUG_ON(len > DLM_LVB_LEN); + + spin_lock(&lockres->l_lock); + + BUG_ON(lockres->l_level < LKM_EXMODE); + memcpy(lvb, val, len); + + spin_unlock(&lockres->l_lock); +} + +void user_dlm_read_lvb(struct inode *inode, + char *val, + unsigned int len) +{ + struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; + char *lvb = lockres->l_lksb.lvb; + + BUG_ON(len > DLM_LVB_LEN); + + spin_lock(&lockres->l_lock); + + BUG_ON(lockres->l_level < LKM_PRMODE); + memcpy(val, lvb, len); + + spin_unlock(&lockres->l_lock); +} + +void user_dlm_lock_res_init(struct user_lock_res *lockres, + struct dentry *dentry) +{ + memset(lockres, 0, sizeof(*lockres)); + + spin_lock_init(&lockres->l_lock); + init_waitqueue_head(&lockres->l_event); + lockres->l_level = LKM_IVMODE; + lockres->l_requested = LKM_IVMODE; + lockres->l_blocking = LKM_IVMODE; + + /* should have been checked before getting here. */ + BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); + + memcpy(lockres->l_name, + dentry->d_name.name, + dentry->d_name.len); + lockres->l_namelen = dentry->d_name.len; +} + +int user_dlm_destroy_lock(struct user_lock_res *lockres) +{ + int status = -EBUSY; + struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); + + mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name); + + spin_lock(&lockres->l_lock); + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + spin_unlock(&lockres->l_lock); + return 0; + } + + lockres->l_flags |= USER_LOCK_IN_TEARDOWN; + + while (lockres->l_flags & USER_LOCK_BUSY) { + spin_unlock(&lockres->l_lock); + + user_wait_on_busy_lock(lockres); + + spin_lock(&lockres->l_lock); + } + + if (lockres->l_ro_holders || lockres->l_ex_holders) { + spin_unlock(&lockres->l_lock); + goto bail; + } + + status = 0; + if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { + spin_unlock(&lockres->l_lock); + goto bail; + } + + lockres->l_flags &= ~USER_LOCK_ATTACHED; + lockres->l_flags |= USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); + + status = dlmunlock(dlm, + &lockres->l_lksb, + LKM_VALBLK, + user_unlock_ast, + lockres); + if (status != DLM_NORMAL) { + user_log_dlm_error("dlmunlock", status, lockres); + status = -EINVAL; + goto bail; + } + + user_wait_on_busy_lock(lockres); + + status = 0; +bail: + return status; +} + +struct dlm_ctxt *user_dlm_register_context(struct qstr *name, + struct dlm_protocol_version *proto) +{ + struct dlm_ctxt *dlm; + u32 dlm_key; + char *domain; + + domain = kmalloc(name->len + 1, GFP_NOFS); + if (!domain) { + mlog_errno(-ENOMEM); + return ERR_PTR(-ENOMEM); + } + + dlm_key = crc32_le(0, name->name, name->len); + + snprintf(domain, name->len + 1, "%.*s", name->len, name->name); + + dlm = dlm_register_domain(domain, dlm_key, proto); + if (IS_ERR(dlm)) + mlog_errno(PTR_ERR(dlm)); + + kfree(domain); + return dlm; +} + +void user_dlm_unregister_context(struct dlm_ctxt *dlm) +{ + dlm_unregister_domain(dlm); +} diff --git a/fs/ocfs2/dlmfs/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h new file mode 100644 index 000000000000..0c3cc03c61fa --- /dev/null +++ b/fs/ocfs2/dlmfs/userdlm.h @@ -0,0 +1,113 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * userdlm.h + * + * Userspace dlm defines + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + + +#ifndef USERDLM_H +#define USERDLM_H + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/workqueue.h> + +/* user_lock_res->l_flags flags. */ +#define USER_LOCK_ATTACHED (0x00000001) /* we have initialized + * the lvb */ +#define USER_LOCK_BUSY (0x00000002) /* we are currently in + * dlm_lock */ +#define USER_LOCK_BLOCKED (0x00000004) /* blocked waiting to + * downconvert*/ +#define USER_LOCK_IN_TEARDOWN (0x00000008) /* we're currently + * destroying this + * lock. */ +#define USER_LOCK_QUEUED (0x00000010) /* lock is on the + * workqueue */ +#define USER_LOCK_IN_CANCEL (0x00000020) + +struct user_lock_res { + spinlock_t l_lock; + + int l_flags; + +#define USER_DLM_LOCK_ID_MAX_LEN 32 + char l_name[USER_DLM_LOCK_ID_MAX_LEN]; + int l_namelen; + int l_level; + unsigned int l_ro_holders; + unsigned int l_ex_holders; + struct dlm_lockstatus l_lksb; + + int l_requested; + int l_blocking; + + wait_queue_head_t l_event; + + struct work_struct l_work; +}; + +extern struct workqueue_struct *user_dlm_worker; + +void user_dlm_lock_res_init(struct user_lock_res *lockres, + struct dentry *dentry); +int user_dlm_destroy_lock(struct user_lock_res *lockres); +int user_dlm_cluster_lock(struct user_lock_res *lockres, + int level, + int lkm_flags); +void user_dlm_cluster_unlock(struct user_lock_res *lockres, + int level); +void user_dlm_write_lvb(struct inode *inode, + const char *val, + unsigned int len); +void user_dlm_read_lvb(struct inode *inode, + char *val, + unsigned int len); +struct dlm_ctxt *user_dlm_register_context(struct qstr *name, + struct dlm_protocol_version *proto); +void user_dlm_unregister_context(struct dlm_ctxt *dlm); + +struct dlmfs_inode_private { + struct dlm_ctxt *ip_dlm; + + struct user_lock_res ip_lockres; /* unused for directories. */ + struct inode *ip_parent; + + struct inode ip_vfs_inode; +}; + +static inline struct dlmfs_inode_private * +DLMFS_I(struct inode *inode) +{ + return container_of(inode, + struct dlmfs_inode_private, + ip_vfs_inode); +} + +struct dlmfs_filp_private { + int fp_lock_level; +}; + +#define DLMFS_MAGIC 0x76a9f425 + +#endif /* USERDLM_H */ -- cgit v1.2.3 From a796d2862aed8117acc9f470f3429a5ee852912e Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Thu, 28 Jan 2010 19:22:39 -0800 Subject: ocfs2: Pass lksbs back from stackglue ast/bast functions. The stackglue ast and bast functions tried to maintain the fiction that their arguments were void pointers. In reality, stack_user.c had to know that the argument was an ocfs2_lock_res in order to get the status off of the lksb. That's ugly. This changes stackglue to always pass the lksb as the argument to ast and bast functions. The caller can always use container_of() to get the ocfs2_lock_res or user_dlm_lock_res. The net effect to the caller is zero. They still get back the lockres in their ast. stackglue gets cleaner, and now can use the lksb itself. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/dlmglue.c | 34 +++++++++++++++++----------------- fs/ocfs2/stack_o2cb.c | 22 +++++++++++++--------- fs/ocfs2/stack_user.c | 29 +++++++++++------------------ fs/ocfs2/stackglue.c | 19 ++++++++----------- fs/ocfs2/stackglue.h | 42 +++++++++++++++++++++--------------------- 5 files changed, 70 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index e044019cb3b1..1ba67df00924 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -297,6 +297,11 @@ static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) lockres->l_type == OCFS2_LOCK_TYPE_OPEN; } +static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(union ocfs2_dlm_lksb *lksb) +{ + return container_of(lksb, struct ocfs2_lock_res, l_lksb); +} + static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) { BUG_ON(!ocfs2_is_inode_lock(lockres)); @@ -1041,9 +1046,9 @@ static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres) } -static void ocfs2_blocking_ast(void *opaque, int level) +static void ocfs2_blocking_ast(union ocfs2_dlm_lksb *lksb, int level) { - struct ocfs2_lock_res *lockres = opaque; + struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); int needs_downconvert; unsigned long flags; @@ -1072,9 +1077,9 @@ static void ocfs2_blocking_ast(void *opaque, int level) ocfs2_wake_downconvert_thread(osb); } -static void ocfs2_locking_ast(void *opaque) +static void ocfs2_locking_ast(union ocfs2_dlm_lksb *lksb) { - struct ocfs2_lock_res *lockres = opaque; + struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); unsigned long flags; int status; @@ -1189,8 +1194,7 @@ static int ocfs2_lock_create(struct ocfs2_super *osb, &lockres->l_lksb, dlm_flags, lockres->l_name, - OCFS2_LOCK_ID_MAX_LEN - 1, - lockres); + OCFS2_LOCK_ID_MAX_LEN - 1); lockres_clear_pending(lockres, gen, osb); if (ret) { ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); @@ -1421,8 +1425,7 @@ again: &lockres->l_lksb, lkm_flags, lockres->l_name, - OCFS2_LOCK_ID_MAX_LEN - 1, - lockres); + OCFS2_LOCK_ID_MAX_LEN - 1); lockres_clear_pending(lockres, gen, osb); if (ret) { if (!(lkm_flags & DLM_LKF_NOQUEUE) || @@ -1859,8 +1862,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock) spin_unlock_irqrestore(&lockres->l_lock, flags); ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags, - lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, - lockres); + lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1); if (ret) { if (!trylock || (ret != -EAGAIN)) { ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); @@ -3056,9 +3058,9 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb, mlog_exit_void(); } -static void ocfs2_unlock_ast(void *opaque, int error) +static void ocfs2_unlock_ast(union ocfs2_dlm_lksb *lksb, int error) { - struct ocfs2_lock_res *lockres = opaque; + struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); unsigned long flags; mlog_entry_void(); @@ -3167,8 +3169,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb, mlog(0, "lock %s\n", lockres->l_name); - ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags, - lockres); + ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags); if (ret) { ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); @@ -3309,8 +3310,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, &lockres->l_lksb, dlm_flags, lockres->l_name, - OCFS2_LOCK_ID_MAX_LEN - 1, - lockres); + OCFS2_LOCK_ID_MAX_LEN - 1); lockres_clear_pending(lockres, generation, osb); if (ret) { ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); @@ -3365,7 +3365,7 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb, mlog(0, "lock %s\n", lockres->l_name); ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, - DLM_LKF_CANCEL, lockres); + DLM_LKF_CANCEL); if (ret) { ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); ocfs2_recover_from_dlm_error(lockres, 0); diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 3038c92af493..c4cedff365df 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -161,20 +161,26 @@ static int dlm_status_to_errno(enum dlm_status status) static void o2dlm_lock_ast_wrapper(void *astarg) { + union ocfs2_dlm_lksb *lksb = astarg; + BUG_ON(o2cb_stack.sp_proto == NULL); - o2cb_stack.sp_proto->lp_lock_ast(astarg); + o2cb_stack.sp_proto->lp_lock_ast(lksb); } static void o2dlm_blocking_ast_wrapper(void *astarg, int level) { + union ocfs2_dlm_lksb *lksb = astarg; + BUG_ON(o2cb_stack.sp_proto == NULL); - o2cb_stack.sp_proto->lp_blocking_ast(astarg, level); + o2cb_stack.sp_proto->lp_blocking_ast(lksb, level); } static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) { + union ocfs2_dlm_lksb *lksb = astarg; + int error = dlm_status_to_errno(status); BUG_ON(o2cb_stack.sp_proto == NULL); @@ -193,7 +199,7 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) if (status == DLM_CANCELGRANT) return; - o2cb_stack.sp_proto->lp_unlock_ast(astarg, error); + o2cb_stack.sp_proto->lp_unlock_ast(lksb, error); } static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, @@ -201,8 +207,7 @@ static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, union ocfs2_dlm_lksb *lksb, u32 flags, void *name, - unsigned int namelen, - void *astarg) + unsigned int namelen) { enum dlm_status status; int o2dlm_mode = mode_to_o2dlm(mode); @@ -211,7 +216,7 @@ static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm, o2dlm_flags, name, namelen, - o2dlm_lock_ast_wrapper, astarg, + o2dlm_lock_ast_wrapper, lksb, o2dlm_blocking_ast_wrapper); ret = dlm_status_to_errno(status); return ret; @@ -219,15 +224,14 @@ static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn, union ocfs2_dlm_lksb *lksb, - u32 flags, - void *astarg) + u32 flags) { enum dlm_status status; int o2dlm_flags = flags_to_o2dlm(flags); int ret; status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm, - o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg); + o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb); ret = dlm_status_to_errno(status); return ret; } diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index da78a2a334fd..129b93159cca 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -25,7 +25,6 @@ #include <linux/reboot.h> #include <asm/uaccess.h> -#include "ocfs2.h" /* For struct ocfs2_lock_res */ #include "stackglue.h" #include <linux/dlm_plock.h> @@ -664,16 +663,10 @@ static void ocfs2_control_exit(void) -rc); } -static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg) -{ - struct ocfs2_lock_res *res = astarg; - return &res->l_lksb.lksb_fsdlm; -} - static void fsdlm_lock_ast_wrapper(void *astarg) { - struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg); - int status = lksb->sb_status; + union ocfs2_dlm_lksb *lksb = astarg; + int status = lksb->lksb_fsdlm.sb_status; BUG_ON(ocfs2_user_plugin.sp_proto == NULL); @@ -688,16 +681,18 @@ static void fsdlm_lock_ast_wrapper(void *astarg) */ if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL) - ocfs2_user_plugin.sp_proto->lp_unlock_ast(astarg, 0); + ocfs2_user_plugin.sp_proto->lp_unlock_ast(lksb, 0); else - ocfs2_user_plugin.sp_proto->lp_lock_ast(astarg); + ocfs2_user_plugin.sp_proto->lp_lock_ast(lksb); } static void fsdlm_blocking_ast_wrapper(void *astarg, int level) { + union ocfs2_dlm_lksb *lksb = astarg; + BUG_ON(ocfs2_user_plugin.sp_proto == NULL); - ocfs2_user_plugin.sp_proto->lp_blocking_ast(astarg, level); + ocfs2_user_plugin.sp_proto->lp_blocking_ast(lksb, level); } static int user_dlm_lock(struct ocfs2_cluster_connection *conn, @@ -705,8 +700,7 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn, union ocfs2_dlm_lksb *lksb, u32 flags, void *name, - unsigned int namelen, - void *astarg) + unsigned int namelen) { int ret; @@ -716,20 +710,19 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn, ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, flags|DLM_LKF_NODLCKWT, name, namelen, 0, - fsdlm_lock_ast_wrapper, astarg, + fsdlm_lock_ast_wrapper, lksb, fsdlm_blocking_ast_wrapper); return ret; } static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, union ocfs2_dlm_lksb *lksb, - u32 flags, - void *astarg) + u32 flags) { int ret; ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, - flags, &lksb->lksb_fsdlm, astarg); + flags, &lksb->lksb_fsdlm, lksb); return ret; } diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index f3df0baa9a48..3500d9839d76 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -233,35 +233,32 @@ EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol); /* - * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take - * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the - * underlying stack plugins need to pilfer the lksb off of the lock_res. - * If some other structure needs to be passed as an astarg, the plugins - * will need to be given a different avenue to the lksb. + * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument + * for the ast and bast functions. They will pass the lksb to the ast + * and bast. The caller can wrap the lksb with their own structure to + * get more information. */ int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, int mode, union ocfs2_dlm_lksb *lksb, u32 flags, void *name, - unsigned int namelen, - struct ocfs2_lock_res *astarg) + unsigned int namelen) { BUG_ON(lproto == NULL); return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags, - name, namelen, astarg); + name, namelen); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lock); int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, union ocfs2_dlm_lksb *lksb, - u32 flags, - struct ocfs2_lock_res *astarg) + u32 flags) { BUG_ON(lproto == NULL); - return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg); + return active_stack->sp_ops->dlm_unlock(conn, lksb, flags); } EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock); diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 03a44d60eac9..d699117fb851 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -55,17 +55,6 @@ struct ocfs2_protocol_version { u8 pv_minor; }; -/* - * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf. - */ -struct ocfs2_locking_protocol { - struct ocfs2_protocol_version lp_max_version; - void (*lp_lock_ast)(void *astarg); - void (*lp_blocking_ast)(void *astarg, int level); - void (*lp_unlock_ast)(void *astarg, int error); -}; - - /* * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only * has a pointer to separately allocated lvb space. This struct exists only to @@ -87,6 +76,17 @@ union ocfs2_dlm_lksb { struct fsdlm_lksb_plus_lvb padding; }; +/* + * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf. + */ +struct ocfs2_locking_protocol { + struct ocfs2_protocol_version lp_max_version; + void (*lp_lock_ast)(union ocfs2_dlm_lksb *lksb); + void (*lp_blocking_ast)(union ocfs2_dlm_lksb *lksb, int level); + void (*lp_unlock_ast)(union ocfs2_dlm_lksb *lksb, int error); +}; + + /* * A cluster connection. Mostly opaque to ocfs2, the connection holds * state for the underlying stack. ocfs2 does use cc_version to determine @@ -155,27 +155,29 @@ struct ocfs2_stack_operations { * * ast and bast functions are not part of the call because the * stack will likely want to wrap ast and bast calls before passing - * them to stack->sp_proto. + * them to stack->sp_proto. There is no astarg. The lksb will + * be passed back to the ast and bast functions. The caller can + * use this to find their object. */ int (*dlm_lock)(struct ocfs2_cluster_connection *conn, int mode, union ocfs2_dlm_lksb *lksb, u32 flags, void *name, - unsigned int namelen, - void *astarg); + unsigned int namelen); /* * Call the underlying dlm unlock function. The ->dlm_unlock() * function should convert the flags as appropriate. * * The unlock ast is not passed, as the stack will want to wrap - * it before calling stack->sp_proto->lp_unlock_ast(). + * it before calling stack->sp_proto->lp_unlock_ast(). There is + * no astarg. The lksb will be passed back to the unlock ast + * function. The caller can use this to find their object. */ int (*dlm_unlock)(struct ocfs2_cluster_connection *conn, union ocfs2_dlm_lksb *lksb, - u32 flags, - void *astarg); + u32 flags); /* * Return the status of the current lock status block. The fs @@ -249,12 +251,10 @@ int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, union ocfs2_dlm_lksb *lksb, u32 flags, void *name, - unsigned int namelen, - struct ocfs2_lock_res *astarg); + unsigned int namelen); int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, union ocfs2_dlm_lksb *lksb, - u32 flags, - struct ocfs2_lock_res *astarg); + u32 flags); int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb); int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb); -- cgit v1.2.3 From c0e4133851ed94c73ee3d34a2f2a245fcd0a60a1 Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Fri, 29 Jan 2010 14:46:44 -0800 Subject: ocfs2: Attach the connection to the lksb We're going to want it in the ast functions, so we convert union ocfs2_dlm_lksb to struct ocfs2_dlm_lksb and let it carry the connection. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/dlmglue.c | 8 ++++---- fs/ocfs2/ocfs2.h | 2 +- fs/ocfs2/stack_o2cb.c | 18 +++++++++--------- fs/ocfs2/stack_user.c | 16 ++++++++-------- fs/ocfs2/stackglue.c | 17 +++++++++++------ fs/ocfs2/stackglue.h | 42 +++++++++++++++++++++++------------------- 6 files changed, 56 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1ba67df00924..2bb868b7b44f 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -297,7 +297,7 @@ static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) lockres->l_type == OCFS2_LOCK_TYPE_OPEN; } -static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(union ocfs2_dlm_lksb *lksb) +static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb) { return container_of(lksb, struct ocfs2_lock_res, l_lksb); } @@ -1046,7 +1046,7 @@ static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres) } -static void ocfs2_blocking_ast(union ocfs2_dlm_lksb *lksb, int level) +static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level) { struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); @@ -1077,7 +1077,7 @@ static void ocfs2_blocking_ast(union ocfs2_dlm_lksb *lksb, int level) ocfs2_wake_downconvert_thread(osb); } -static void ocfs2_locking_ast(union ocfs2_dlm_lksb *lksb) +static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb) { struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); @@ -3058,7 +3058,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb, mlog_exit_void(); } -static void ocfs2_unlock_ast(union ocfs2_dlm_lksb *lksb, int error) +static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error) { struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); unsigned long flags; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 8857dd724f90..b27fe2489e0c 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -159,7 +159,7 @@ struct ocfs2_lock_res { int l_level; unsigned int l_ro_holders; unsigned int l_ex_holders; - union ocfs2_dlm_lksb l_lksb; + struct ocfs2_dlm_lksb l_lksb; /* used from AST/BAST funcs. */ enum ocfs2_ast_action l_action; diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index c4cedff365df..fa9dd79c3615 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -161,7 +161,7 @@ static int dlm_status_to_errno(enum dlm_status status) static void o2dlm_lock_ast_wrapper(void *astarg) { - union ocfs2_dlm_lksb *lksb = astarg; + struct ocfs2_dlm_lksb *lksb = astarg; BUG_ON(o2cb_stack.sp_proto == NULL); @@ -170,7 +170,7 @@ static void o2dlm_lock_ast_wrapper(void *astarg) static void o2dlm_blocking_ast_wrapper(void *astarg, int level) { - union ocfs2_dlm_lksb *lksb = astarg; + struct ocfs2_dlm_lksb *lksb = astarg; BUG_ON(o2cb_stack.sp_proto == NULL); @@ -179,7 +179,7 @@ static void o2dlm_blocking_ast_wrapper(void *astarg, int level) static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) { - union ocfs2_dlm_lksb *lksb = astarg; + struct ocfs2_dlm_lksb *lksb = astarg; int error = dlm_status_to_errno(status); @@ -204,7 +204,7 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, int mode, - union ocfs2_dlm_lksb *lksb, + struct ocfs2_dlm_lksb *lksb, u32 flags, void *name, unsigned int namelen) @@ -223,7 +223,7 @@ static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, } static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn, - union ocfs2_dlm_lksb *lksb, + struct ocfs2_dlm_lksb *lksb, u32 flags) { enum dlm_status status; @@ -236,7 +236,7 @@ static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn, return ret; } -static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb) +static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) { return dlm_status_to_errno(lksb->lksb_o2dlm.status); } @@ -246,17 +246,17 @@ static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb) * contents, it will zero out the LVB. Thus the caller can always trust * the contents. */ -static int o2cb_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) +static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb) { return 1; } -static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb) +static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb) { return (void *)(lksb->lksb_o2dlm.lvb); } -static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb) +static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb) { dlm_print_one_lock(lksb->lksb_o2dlm.lockid); } diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 129b93159cca..31276bac78f5 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -665,7 +665,7 @@ static void ocfs2_control_exit(void) static void fsdlm_lock_ast_wrapper(void *astarg) { - union ocfs2_dlm_lksb *lksb = astarg; + struct ocfs2_dlm_lksb *lksb = astarg; int status = lksb->lksb_fsdlm.sb_status; BUG_ON(ocfs2_user_plugin.sp_proto == NULL); @@ -688,7 +688,7 @@ static void fsdlm_lock_ast_wrapper(void *astarg) static void fsdlm_blocking_ast_wrapper(void *astarg, int level) { - union ocfs2_dlm_lksb *lksb = astarg; + struct ocfs2_dlm_lksb *lksb = astarg; BUG_ON(ocfs2_user_plugin.sp_proto == NULL); @@ -697,7 +697,7 @@ static void fsdlm_blocking_ast_wrapper(void *astarg, int level) static int user_dlm_lock(struct ocfs2_cluster_connection *conn, int mode, - union ocfs2_dlm_lksb *lksb, + struct ocfs2_dlm_lksb *lksb, u32 flags, void *name, unsigned int namelen) @@ -716,7 +716,7 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn, } static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, - union ocfs2_dlm_lksb *lksb, + struct ocfs2_dlm_lksb *lksb, u32 flags) { int ret; @@ -726,19 +726,19 @@ static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, return ret; } -static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb) +static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) { return lksb->lksb_fsdlm.sb_status; } -static int user_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) +static int user_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb) { int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID; return !invalid; } -static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb) +static void *user_dlm_lvb(struct ocfs2_dlm_lksb *lksb) { if (!lksb->lksb_fsdlm.sb_lvbptr) lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + @@ -746,7 +746,7 @@ static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb) return (void *)(lksb->lksb_fsdlm.sb_lvbptr); } -static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) +static void user_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb) { } diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 3500d9839d76..8ef9a574315e 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -240,47 +240,52 @@ EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol); */ int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, int mode, - union ocfs2_dlm_lksb *lksb, + struct ocfs2_dlm_lksb *lksb, u32 flags, void *name, unsigned int namelen) { BUG_ON(lproto == NULL); + if (!lksb->lksb_conn) + lksb->lksb_conn = conn; + else + BUG_ON(lksb->lksb_conn != conn); return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags, name, namelen); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lock); int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, - union ocfs2_dlm_lksb *lksb, + struct ocfs2_dlm_lksb *lksb, u32 flags) { BUG_ON(lproto == NULL); + BUG_ON(lksb->lksb_conn == NULL); return active_stack->sp_ops->dlm_unlock(conn, lksb, flags); } EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock); -int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb) +int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) { return active_stack->sp_ops->lock_status(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); -int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) +int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb) { return active_stack->sp_ops->lvb_valid(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid); -void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb) +void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb) { return active_stack->sp_ops->lock_lvb(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb); -void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) +void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb) { active_stack->sp_ops->dump_lksb(lksb); } diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index d699117fb851..bb32926912ba 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -70,10 +70,14 @@ struct fsdlm_lksb_plus_lvb { * size of the union is known. Lock status structures are embedded in * ocfs2 inodes. */ -union ocfs2_dlm_lksb { - struct dlm_lockstatus lksb_o2dlm; - struct dlm_lksb lksb_fsdlm; - struct fsdlm_lksb_plus_lvb padding; +struct ocfs2_cluster_connection; +struct ocfs2_dlm_lksb { + union { + struct dlm_lockstatus lksb_o2dlm; + struct dlm_lksb lksb_fsdlm; + struct fsdlm_lksb_plus_lvb padding; + }; + struct ocfs2_cluster_connection *lksb_conn; }; /* @@ -81,9 +85,9 @@ union ocfs2_dlm_lksb { */ struct ocfs2_locking_protocol { struct ocfs2_protocol_version lp_max_version; - void (*lp_lock_ast)(union ocfs2_dlm_lksb *lksb); - void (*lp_blocking_ast)(union ocfs2_dlm_lksb *lksb, int level); - void (*lp_unlock_ast)(union ocfs2_dlm_lksb *lksb, int error); + void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb); + void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level); + void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error); }; @@ -161,7 +165,7 @@ struct ocfs2_stack_operations { */ int (*dlm_lock)(struct ocfs2_cluster_connection *conn, int mode, - union ocfs2_dlm_lksb *lksb, + struct ocfs2_dlm_lksb *lksb, u32 flags, void *name, unsigned int namelen); @@ -176,7 +180,7 @@ struct ocfs2_stack_operations { * function. The caller can use this to find their object. */ int (*dlm_unlock)(struct ocfs2_cluster_connection *conn, - union ocfs2_dlm_lksb *lksb, + struct ocfs2_dlm_lksb *lksb, u32 flags); /* @@ -185,17 +189,17 @@ struct ocfs2_stack_operations { * callback pulls out the stack-specific lksb, converts the status * to a proper errno, and returns it. */ - int (*lock_status)(union ocfs2_dlm_lksb *lksb); + int (*lock_status)(struct ocfs2_dlm_lksb *lksb); /* * Return non-zero if the LVB is valid. */ - int (*lvb_valid)(union ocfs2_dlm_lksb *lksb); + int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb); /* * Pull the lvb pointer off of the stack-specific lksb. */ - void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); + void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb); /* * Cluster-aware posix locks @@ -212,7 +216,7 @@ struct ocfs2_stack_operations { * This is an optoinal debugging hook. If provided, the * stack can dump debugging information about this lock. */ - void (*dump_lksb)(union ocfs2_dlm_lksb *lksb); + void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb); }; /* @@ -248,18 +252,18 @@ int ocfs2_cluster_this_node(unsigned int *node); struct ocfs2_lock_res; int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, int mode, - union ocfs2_dlm_lksb *lksb, + struct ocfs2_dlm_lksb *lksb, u32 flags, void *name, unsigned int namelen); int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, - union ocfs2_dlm_lksb *lksb, + struct ocfs2_dlm_lksb *lksb, u32 flags); -int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb); -int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb); -void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); -void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); +int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb); +int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb); +void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb); +void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb); int ocfs2_stack_supports_plocks(void); int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, -- cgit v1.2.3 From 110946c8fb23c1e1e23312afed0977ad4aa37c95 Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Fri, 29 Jan 2010 15:46:23 -0800 Subject: ocfs2: Hang the locking proto on the cluster conn and use it in asts. With the ocfs2_cluster_connection hanging off of the ocfs2_dlm_lksb, we have access to it in the ast and bast wrapper functions. Attach the ocfs2_locking_protocol to the conn. Now, instead of refering to a static variable for ast/bast pointers, the wrappers can look at the connection. This means different connections can have different ast/bast pointers, and it reduces the need for the static pointer. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/stack_o2cb.c | 15 ++++----------- fs/ocfs2/stack_user.c | 10 +++------- fs/ocfs2/stackglue.c | 1 + fs/ocfs2/stackglue.h | 1 + 4 files changed, 9 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index fa9dd79c3615..7020e1253ffa 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -163,28 +163,21 @@ static void o2dlm_lock_ast_wrapper(void *astarg) { struct ocfs2_dlm_lksb *lksb = astarg; - BUG_ON(o2cb_stack.sp_proto == NULL); - - o2cb_stack.sp_proto->lp_lock_ast(lksb); + lksb->lksb_conn->cc_proto->lp_lock_ast(lksb); } static void o2dlm_blocking_ast_wrapper(void *astarg, int level) { struct ocfs2_dlm_lksb *lksb = astarg; - BUG_ON(o2cb_stack.sp_proto == NULL); - - o2cb_stack.sp_proto->lp_blocking_ast(lksb, level); + lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level); } static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) { struct ocfs2_dlm_lksb *lksb = astarg; - int error = dlm_status_to_errno(status); - BUG_ON(o2cb_stack.sp_proto == NULL); - /* * In o2dlm, you can get both the lock_ast() for the lock being * granted and the unlock_ast() for the CANCEL failing. A @@ -199,7 +192,7 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) if (status == DLM_CANCELGRANT) return; - o2cb_stack.sp_proto->lp_unlock_ast(lksb, error); + lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error); } static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, @@ -284,7 +277,7 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) struct dlm_protocol_version fs_version; BUG_ON(conn == NULL); - BUG_ON(o2cb_stack.sp_proto == NULL); + BUG_ON(conn->cc_proto == NULL); /* for now we only have one cluster/node, make sure we see it * in the heartbeat universe */ diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 31276bac78f5..b4cf616ef423 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -668,8 +668,6 @@ static void fsdlm_lock_ast_wrapper(void *astarg) struct ocfs2_dlm_lksb *lksb = astarg; int status = lksb->lksb_fsdlm.sb_status; - BUG_ON(ocfs2_user_plugin.sp_proto == NULL); - /* * For now we're punting on the issue of other non-standard errors * where we can't tell if the unlock_ast or lock_ast should be called. @@ -681,18 +679,16 @@ static void fsdlm_lock_ast_wrapper(void *astarg) */ if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL) - ocfs2_user_plugin.sp_proto->lp_unlock_ast(lksb, 0); + lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, 0); else - ocfs2_user_plugin.sp_proto->lp_lock_ast(lksb); + lksb->lksb_conn->cc_proto->lp_lock_ast(lksb); } static void fsdlm_blocking_ast_wrapper(void *astarg, int level) { struct ocfs2_dlm_lksb *lksb = astarg; - BUG_ON(ocfs2_user_plugin.sp_proto == NULL); - - ocfs2_user_plugin.sp_proto->lp_blocking_ast(lksb, level); + lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level); } static int user_dlm_lock(struct ocfs2_cluster_connection *conn, diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 8ef9a574315e..010ecabbdeb5 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -343,6 +343,7 @@ int ocfs2_cluster_connect(const char *stack_name, new_conn->cc_recovery_handler = recovery_handler; new_conn->cc_recovery_data = recovery_data; + new_conn->cc_proto = lproto; /* Start the new connection at our maximum compatibility level */ new_conn->cc_version = lproto->lp_max_version; diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index bb32926912ba..cf8bac23ae09 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -100,6 +100,7 @@ struct ocfs2_cluster_connection { char cc_name[GROUP_NAME_MAX]; int cc_namelen; struct ocfs2_protocol_version cc_version; + struct ocfs2_locking_protocol *cc_proto; void (*cc_recovery_handler)(int node_num, void *recovery_data); void *cc_recovery_data; void *cc_lockspace; -- cgit v1.2.3 From e603cfb074e150736814ef093a411df32c02ba9f Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Fri, 29 Jan 2010 16:06:29 -0800 Subject: ocfs2: Remove the ast pointers from ocfs2_stack_plugins With the full ocfs2_locking_protocol hanging off of the ocfs2_cluster_connection, ast wrappers can get the ast/bast pointers there. They don't need to get them from their plugin structure. The user plugin still needs the maximum locking protocol version, though. This changes the plugin structure so that it only holds the max version, not the entire ocfs2_locking_protocol pointer. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/stack_user.c | 6 +++--- fs/ocfs2/stackglue.c | 4 ++-- fs/ocfs2/stackglue.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index b4cf616ef423..5ae8812b2864 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -62,8 +62,8 @@ * negotiated by the client. The client negotiates based on the maximum * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major * number from the "SETV" message must match - * ocfs2_user_plugin.sp_proto->lp_max_version.pv_major, and the minor number - * must be less than or equal to ...->lp_max_version.pv_minor. + * ocfs2_user_plugin.sp_max_proto.pv_major, and the minor number + * must be less than or equal to ...sp_max_version.pv_minor. * * Once this information has been set, mounts will be allowed. From this * point on, the "DOWN" message can be sent for node down notification. @@ -400,7 +400,7 @@ static int ocfs2_control_do_setversion_msg(struct file *file, char *ptr = NULL; struct ocfs2_control_private *p = file->private_data; struct ocfs2_protocol_version *max = - &ocfs2_user_plugin.sp_proto->lp_max_version; + &ocfs2_user_plugin.sp_max_proto; if (ocfs2_control_get_handshake_state(file) != OCFS2_CONTROL_HANDSHAKE_PROTOCOL) diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 010ecabbdeb5..fc184c762700 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -176,7 +176,7 @@ int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin) spin_lock(&ocfs2_stack_lock); if (!ocfs2_stack_lookup(plugin->sp_name)) { plugin->sp_count = 0; - plugin->sp_proto = lproto; + plugin->sp_max_proto = lproto->lp_max_version; list_add(&plugin->sp_list, &ocfs2_stack_list); printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", plugin->sp_name); @@ -224,7 +224,7 @@ void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto) lproto = proto; list_for_each_entry(p, &ocfs2_stack_list, sp_list) { - p->sp_proto = lproto; + p->sp_max_proto = lproto->lp_max_version; } spin_unlock(&ocfs2_stack_lock); diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index cf8bac23ae09..77a7a9aeba73 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -233,7 +233,7 @@ struct ocfs2_stack_plugin { /* These are managed by the stackglue code. */ struct list_head sp_list; unsigned int sp_count; - struct ocfs2_locking_protocol *sp_proto; + struct ocfs2_protocol_version sp_max_proto; }; -- cgit v1.2.3 From 553b5eb91abd5f8e679d23ae547b92c589726814 Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Fri, 29 Jan 2010 17:19:06 -0800 Subject: ocfs2: Pass the locking protocol into ocfs2_cluster_connect(). Inside the stackglue, the locking protocol structure is hanging off of the ocfs2_cluster_connection. This takes it one further; the locking protocol is passed into ocfs2_cluster_connect(). Now different cluster connections can have different locking protocols with distinct asts. Note that all locking protocols have to keep their maximum protocol version in lock-step. With the protocol structure set in ocfs2_cluster_connect(), there is no need for the stackglue to have a static pointer to a specific protocol structure. We can change initialization to only pass in the maximum protocol version. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/dlmglue.c | 168 +++++++++++++++++++++++++-------------------------- fs/ocfs2/stackglue.c | 43 +++++++------ fs/ocfs2/stackglue.h | 3 +- 3 files changed, 110 insertions(+), 104 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 2bb868b7b44f..d009d7744d63 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1045,7 +1045,6 @@ static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres) return lockres->l_pending_gen; } - static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level) { struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); @@ -1139,6 +1138,88 @@ out: spin_unlock_irqrestore(&lockres->l_lock, flags); } +static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error) +{ + struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); + unsigned long flags; + + mlog_entry_void(); + + mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name, + lockres->l_unlock_action); + + spin_lock_irqsave(&lockres->l_lock, flags); + if (error) { + mlog(ML_ERROR, "Dlm passes error %d for lock %s, " + "unlock_action %d\n", error, lockres->l_name, + lockres->l_unlock_action); + spin_unlock_irqrestore(&lockres->l_lock, flags); + mlog_exit_void(); + return; + } + + switch(lockres->l_unlock_action) { + case OCFS2_UNLOCK_CANCEL_CONVERT: + mlog(0, "Cancel convert success for %s\n", lockres->l_name); + lockres->l_action = OCFS2_AST_INVALID; + /* Downconvert thread may have requeued this lock, we + * need to wake it. */ + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) + ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres)); + break; + case OCFS2_UNLOCK_DROP_LOCK: + lockres->l_level = DLM_LOCK_IV; + break; + default: + BUG(); + } + + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; + wake_up(&lockres->l_event); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + mlog_exit_void(); +} + +/* + * This is the filesystem locking protocol. It provides the lock handling + * hooks for the underlying DLM. It has a maximum version number. + * The version number allows interoperability with systems running at + * the same major number and an equal or smaller minor number. + * + * Whenever the filesystem does new things with locks (adds or removes a + * lock, orders them differently, does different things underneath a lock), + * the version must be changed. The protocol is negotiated when joining + * the dlm domain. A node may join the domain if its major version is + * identical to all other nodes and its minor version is greater than + * or equal to all other nodes. When its minor version is greater than + * the other nodes, it will run at the minor version specified by the + * other nodes. + * + * If a locking change is made that will not be compatible with older + * versions, the major number must be increased and the minor version set + * to zero. If a change merely adds a behavior that can be disabled when + * speaking to older versions, the minor version must be increased. If a + * change adds a fully backwards compatible change (eg, LVB changes that + * are just ignored by older versions), the version does not need to be + * updated. + */ +static struct ocfs2_locking_protocol lproto = { + .lp_max_version = { + .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, + .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, + }, + .lp_lock_ast = ocfs2_locking_ast, + .lp_blocking_ast = ocfs2_blocking_ast, + .lp_unlock_ast = ocfs2_unlock_ast, +}; + +void ocfs2_set_locking_protocol(void) +{ + ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version); +} + static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, int convert) { @@ -2991,7 +3072,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) status = ocfs2_cluster_connect(osb->osb_cluster_stack, osb->uuid_str, strlen(osb->uuid_str), - ocfs2_do_node_down, osb, + &lproto, ocfs2_do_node_down, osb, &conn); if (status) { mlog_errno(status); @@ -3058,50 +3139,6 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb, mlog_exit_void(); } -static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error) -{ - struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); - unsigned long flags; - - mlog_entry_void(); - - mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name, - lockres->l_unlock_action); - - spin_lock_irqsave(&lockres->l_lock, flags); - if (error) { - mlog(ML_ERROR, "Dlm passes error %d for lock %s, " - "unlock_action %d\n", error, lockres->l_name, - lockres->l_unlock_action); - spin_unlock_irqrestore(&lockres->l_lock, flags); - mlog_exit_void(); - return; - } - - switch(lockres->l_unlock_action) { - case OCFS2_UNLOCK_CANCEL_CONVERT: - mlog(0, "Cancel convert success for %s\n", lockres->l_name); - lockres->l_action = OCFS2_AST_INVALID; - /* Downconvert thread may have requeued this lock, we - * need to wake it. */ - if (lockres->l_flags & OCFS2_LOCK_BLOCKED) - ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres)); - break; - case OCFS2_UNLOCK_DROP_LOCK: - lockres->l_level = DLM_LOCK_IV; - break; - default: - BUG(); - } - - lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); - lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; - wake_up(&lockres->l_event); - spin_unlock_irqrestore(&lockres->l_lock, flags); - - mlog_exit_void(); -} - static int ocfs2_drop_lock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres) { @@ -3910,45 +3947,6 @@ void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex) ocfs2_cluster_unlock(osb, lockres, level); } -/* - * This is the filesystem locking protocol. It provides the lock handling - * hooks for the underlying DLM. It has a maximum version number. - * The version number allows interoperability with systems running at - * the same major number and an equal or smaller minor number. - * - * Whenever the filesystem does new things with locks (adds or removes a - * lock, orders them differently, does different things underneath a lock), - * the version must be changed. The protocol is negotiated when joining - * the dlm domain. A node may join the domain if its major version is - * identical to all other nodes and its minor version is greater than - * or equal to all other nodes. When its minor version is greater than - * the other nodes, it will run at the minor version specified by the - * other nodes. - * - * If a locking change is made that will not be compatible with older - * versions, the major number must be increased and the minor version set - * to zero. If a change merely adds a behavior that can be disabled when - * speaking to older versions, the minor version must be increased. If a - * change adds a fully backwards compatible change (eg, LVB changes that - * are just ignored by older versions), the version does not need to be - * updated. - */ -static struct ocfs2_locking_protocol lproto = { - .lp_max_version = { - .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, - .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, - }, - .lp_lock_ast = ocfs2_locking_ast, - .lp_blocking_ast = ocfs2_blocking_ast, - .lp_unlock_ast = ocfs2_unlock_ast, -}; - -void ocfs2_set_locking_protocol(void) -{ - ocfs2_stack_glue_set_locking_protocol(&lproto); -} - - static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres) { diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index fc184c762700..31db2e87cfd4 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -36,7 +36,7 @@ #define OCFS2_STACK_PLUGIN_USER "user" #define OCFS2_MAX_HB_CTL_PATH 256 -static struct ocfs2_locking_protocol *lproto; +static struct ocfs2_protocol_version locking_max_version; static DEFINE_SPINLOCK(ocfs2_stack_lock); static LIST_HEAD(ocfs2_stack_list); static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; @@ -176,7 +176,7 @@ int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin) spin_lock(&ocfs2_stack_lock); if (!ocfs2_stack_lookup(plugin->sp_name)) { plugin->sp_count = 0; - plugin->sp_max_proto = lproto->lp_max_version; + plugin->sp_max_proto = locking_max_version; list_add(&plugin->sp_list, &ocfs2_stack_list); printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", plugin->sp_name); @@ -213,23 +213,23 @@ void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin) } EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister); -void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto) +void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto) { struct ocfs2_stack_plugin *p; - BUG_ON(proto == NULL); - spin_lock(&ocfs2_stack_lock); - BUG_ON(active_stack != NULL); + if (memcmp(max_proto, &locking_max_version, + sizeof(struct ocfs2_protocol_version))) { + BUG_ON(locking_max_version.pv_major != 0); - lproto = proto; - list_for_each_entry(p, &ocfs2_stack_list, sp_list) { - p->sp_max_proto = lproto->lp_max_version; + locking_max_version = *max_proto; + list_for_each_entry(p, &ocfs2_stack_list, sp_list) { + p->sp_max_proto = locking_max_version; + } } - spin_unlock(&ocfs2_stack_lock); } -EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol); +EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version); /* @@ -245,8 +245,6 @@ int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, void *name, unsigned int namelen) { - BUG_ON(lproto == NULL); - if (!lksb->lksb_conn) lksb->lksb_conn = conn; else @@ -260,7 +258,6 @@ int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, struct ocfs2_dlm_lksb *lksb, u32 flags) { - BUG_ON(lproto == NULL); BUG_ON(lksb->lksb_conn == NULL); return active_stack->sp_ops->dlm_unlock(conn, lksb, flags); @@ -314,6 +311,7 @@ EXPORT_SYMBOL_GPL(ocfs2_plock); int ocfs2_cluster_connect(const char *stack_name, const char *group, int grouplen, + struct ocfs2_locking_protocol *lproto, void (*recovery_handler)(int node_num, void *recovery_data), void *recovery_data, @@ -331,6 +329,12 @@ int ocfs2_cluster_connect(const char *stack_name, goto out; } + if (memcmp(&lproto->lp_max_version, &locking_max_version, + sizeof(struct ocfs2_protocol_version))) { + rc = -EINVAL; + goto out; + } + new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection), GFP_KERNEL); if (!new_conn) { @@ -456,10 +460,10 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj, ssize_t ret = 0; spin_lock(&ocfs2_stack_lock); - if (lproto) + if (locking_max_version.pv_major) ret = snprintf(buf, PAGE_SIZE, "%u.%u\n", - lproto->lp_max_version.pv_major, - lproto->lp_max_version.pv_minor); + locking_max_version.pv_major, + locking_max_version.pv_minor); spin_unlock(&ocfs2_stack_lock); return ret; @@ -688,7 +692,10 @@ static int __init ocfs2_stack_glue_init(void) static void __exit ocfs2_stack_glue_exit(void) { - lproto = NULL; + memset(&locking_max_version, 0, + sizeof(struct ocfs2_protocol_version)); + locking_max_version.pv_major = 0; + locking_max_version.pv_minor = 0; ocfs2_sysfs_exit(); if (ocfs2_table_header) unregister_sysctl_table(ocfs2_table_header); diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 77a7a9aeba73..b1981ba4c91f 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -241,6 +241,7 @@ struct ocfs2_stack_plugin { int ocfs2_cluster_connect(const char *stack_name, const char *group, int grouplen, + struct ocfs2_locking_protocol *lproto, void (*recovery_handler)(int node_num, void *recovery_data), void *recovery_data, @@ -270,7 +271,7 @@ int ocfs2_stack_supports_plocks(void); int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, struct file *file, int cmd, struct file_lock *fl); -void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto); +void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto); /* Used by stack plugins */ -- cgit v1.2.3 From e8fce482f3702c1ad27c97b26db5022aa1fa64c7 Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Tue, 9 Feb 2010 17:52:13 -0800 Subject: ocfs2_dlmfs: Don't honor truncate. The size of a dlmfs file is LVB_LEN We want folks using dlmfs to be able to use the LVB in places other than just write(2)/read(2). By ignoring truncate requests, we allow 'echo "contents" > /dlm/space/lockname' to work. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/dlmfs/dlmfs.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index e21ce0e5fc42..13ac2bffb05d 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -220,6 +220,23 @@ static int dlmfs_file_release(struct inode *inode, return 0; } +/* + * We do ->setattr() just to override size changes. Our size is the size + * of the LVB and nothing else. + */ +static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr) +{ + int error; + struct inode *inode = dentry->d_inode; + + attr->ia_valid &= ~ATTR_SIZE; + error = inode_change_ok(inode, attr); + if (!error) + error = inode_setattr(inode, attr); + + return error; +} + static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait) { int event = 0; @@ -634,6 +651,7 @@ static const struct super_operations dlmfs_ops = { static const struct inode_operations dlmfs_file_inode_operations = { .getattr = simple_getattr, + .setattr = dlmfs_file_setattr, }; static int dlmfs_get_sb(struct file_system_type *fs_type, -- cgit v1.2.3 From 0016eedc4185a3cd7e578b027a6e69001b85d6c4 Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Sat, 30 Jan 2010 04:33:50 -0800 Subject: ocfs2_dlmfs: Use the stackglue. Rather than directly using o2dlm, dlmfs can now use the stackglue. This allows it to use userspace cluster stacks and fs/dlm. This commit forces o2cb for now. A latter commit will bump the protocol version and allow non-o2cb stacks. This is one big sed, really. LKM_xxMODE becomes DLM_LOCK_xx. LKM_flag becomes DLM_LKF_flag. We also learn to check that the LVB is valid before reading it. Any DLM can lose the contents of the LVB during a complicated recovery. userdlm should be checking this. Now it does. dlmfs will return 0 from read(2) if the LVB was invalid. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/dlmfs/dlmfs.c | 57 ++++------ fs/ocfs2/dlmfs/userdlm.c | 266 ++++++++++++++++++++++++----------------------- fs/ocfs2/dlmfs/userdlm.h | 16 +-- 3 files changed, 166 insertions(+), 173 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 13ac2bffb05d..8697366b63ad 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -47,21 +47,13 @@ #include <asm/uaccess.h> - -#include "cluster/nodemanager.h" -#include "cluster/heartbeat.h" -#include "cluster/tcp.h" - -#include "dlm/dlmapi.h" - +#include "stackglue.h" #include "userdlm.h" - #include "dlmfsver.h" #define MLOG_MASK_PREFIX ML_DLMFS #include "cluster/masklog.h" -#include "ocfs2_lockingver.h" static const struct super_operations dlmfs_ops; static const struct file_operations dlmfs_file_operations; @@ -72,15 +64,6 @@ static struct kmem_cache *dlmfs_inode_cache; struct workqueue_struct *user_dlm_worker; -/* - * This is the userdlmfs locking protocol version. - * - * See fs/ocfs2/dlmglue.c for more details on locking versions. - */ -static const struct dlm_protocol_version user_locking_protocol = { - .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, - .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, -}; /* @@ -259,7 +242,7 @@ static ssize_t dlmfs_file_read(struct file *filp, loff_t *ppos) { int bytes_left; - ssize_t readlen; + ssize_t readlen, got; char *lvb_buf; struct inode *inode = filp->f_path.dentry->d_inode; @@ -285,9 +268,13 @@ static ssize_t dlmfs_file_read(struct file *filp, if (!lvb_buf) return -ENOMEM; - user_dlm_read_lvb(inode, lvb_buf, readlen); - bytes_left = __copy_to_user(buf, lvb_buf, readlen); - readlen -= bytes_left; + got = user_dlm_read_lvb(inode, lvb_buf, readlen); + if (got) { + BUG_ON(got != readlen); + bytes_left = __copy_to_user(buf, lvb_buf, readlen); + readlen -= bytes_left; + } else + readlen = 0; kfree(lvb_buf); @@ -346,7 +333,7 @@ static void dlmfs_init_once(void *foo) struct dlmfs_inode_private *ip = (struct dlmfs_inode_private *) foo; - ip->ip_dlm = NULL; + ip->ip_conn = NULL; ip->ip_parent = NULL; inode_init_once(&ip->ip_vfs_inode); @@ -388,14 +375,14 @@ static void dlmfs_clear_inode(struct inode *inode) goto clear_fields; } - mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm); + mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn); /* we must be a directory. If required, lets unregister the * dlm context now. */ - if (ip->ip_dlm) - user_dlm_unregister_context(ip->ip_dlm); + if (ip->ip_conn) + user_dlm_unregister(ip->ip_conn); clear_fields: ip->ip_parent = NULL; - ip->ip_dlm = NULL; + ip->ip_conn = NULL; } static struct backing_dev_info dlmfs_backing_dev_info = { @@ -445,7 +432,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; ip = DLMFS_I(inode); - ip->ip_dlm = DLMFS_I(parent)->ip_dlm; + ip->ip_conn = DLMFS_I(parent)->ip_conn; switch (mode & S_IFMT) { default: @@ -499,13 +486,12 @@ static int dlmfs_mkdir(struct inode * dir, struct inode *inode = NULL; struct qstr *domain = &dentry->d_name; struct dlmfs_inode_private *ip; - struct dlm_ctxt *dlm; - struct dlm_protocol_version proto = user_locking_protocol; + struct ocfs2_cluster_connection *conn; mlog(0, "mkdir %.*s\n", domain->len, domain->name); /* verify that we have a proper domain */ - if (domain->len >= O2NM_MAX_NAME_LEN) { + if (domain->len >= GROUP_NAME_MAX) { status = -EINVAL; mlog(ML_ERROR, "invalid domain name for directory.\n"); goto bail; @@ -520,14 +506,14 @@ static int dlmfs_mkdir(struct inode * dir, ip = DLMFS_I(inode); - dlm = user_dlm_register_context(domain, &proto); - if (IS_ERR(dlm)) { - status = PTR_ERR(dlm); + conn = user_dlm_register(domain); + if (IS_ERR(conn)) { + status = PTR_ERR(conn); mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", status, domain->len, domain->name); goto bail; } - ip->ip_dlm = dlm; + ip->ip_conn = conn; inc_nlink(dir); d_instantiate(dentry, inode); @@ -696,6 +682,7 @@ static int __init init_dlmfs_fs(void) } cleanup_worker = 1; + user_dlm_set_locking_protocol(); status = register_filesystem(&dlmfs_fs_type); bail: if (status) { diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c index 6adae70cee8e..c1b6a56a268f 100644 --- a/fs/ocfs2/dlmfs/userdlm.c +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -34,18 +34,19 @@ #include <linux/types.h> #include <linux/crc32.h> - -#include "cluster/nodemanager.h" -#include "cluster/heartbeat.h" -#include "cluster/tcp.h" - -#include "dlm/dlmapi.h" - +#include "ocfs2_lockingver.h" +#include "stackglue.h" #include "userdlm.h" #define MLOG_MASK_PREFIX ML_DLMFS #include "cluster/masklog.h" + +static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb) +{ + return container_of(lksb, struct user_lock_res, l_lksb); +} + static inline int user_check_wait_flag(struct user_lock_res *lockres, int flag) { @@ -73,15 +74,15 @@ static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres) } /* I heart container_of... */ -static inline struct dlm_ctxt * -dlm_ctxt_from_user_lockres(struct user_lock_res *lockres) +static inline struct ocfs2_cluster_connection * +cluster_connection_from_user_lockres(struct user_lock_res *lockres) { struct dlmfs_inode_private *ip; ip = container_of(lockres, struct dlmfs_inode_private, ip_lockres); - return ip->ip_dlm; + return ip->ip_conn; } static struct inode * @@ -103,9 +104,9 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres) } #define user_log_dlm_error(_func, _stat, _lockres) do { \ - mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ - "resource %.*s: %s\n", dlm_errname(_stat), _func, \ - _lockres->l_namelen, _lockres->l_name, dlm_errmsg(_stat)); \ + mlog(ML_ERROR, "Dlm error %d while calling %s on " \ + "resource %.*s\n", _stat, _func, \ + _lockres->l_namelen, _lockres->l_name); \ } while (0) /* WARNING: This function lives in a world where the only three lock @@ -113,34 +114,34 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres) * lock types are added. */ static inline int user_highest_compat_lock_level(int level) { - int new_level = LKM_EXMODE; + int new_level = DLM_LOCK_EX; - if (level == LKM_EXMODE) - new_level = LKM_NLMODE; - else if (level == LKM_PRMODE) - new_level = LKM_PRMODE; + if (level == DLM_LOCK_EX) + new_level = DLM_LOCK_NL; + else if (level == DLM_LOCK_PR) + new_level = DLM_LOCK_PR; return new_level; } -static void user_ast(void *opaque) +static void user_ast(struct ocfs2_dlm_lksb *lksb) { - struct user_lock_res *lockres = opaque; - struct dlm_lockstatus *lksb; + struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); + int status; mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen, lockres->l_name); spin_lock(&lockres->l_lock); - lksb = &(lockres->l_lksb); - if (lksb->status != DLM_NORMAL) { + status = ocfs2_dlm_lock_status(&lockres->l_lksb); + if (status) { mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", - lksb->status, lockres->l_namelen, lockres->l_name); + status, lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); return; } - mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, + mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV, "Lockres %.*s, requested ivmode. flags 0x%x\n", lockres->l_namelen, lockres->l_name, lockres->l_flags); @@ -148,13 +149,13 @@ static void user_ast(void *opaque) if (lockres->l_requested < lockres->l_level) { if (lockres->l_requested <= user_highest_compat_lock_level(lockres->l_blocking)) { - lockres->l_blocking = LKM_NLMODE; + lockres->l_blocking = DLM_LOCK_NL; lockres->l_flags &= ~USER_LOCK_BLOCKED; } } lockres->l_level = lockres->l_requested; - lockres->l_requested = LKM_IVMODE; + lockres->l_requested = DLM_LOCK_IV; lockres->l_flags |= USER_LOCK_ATTACHED; lockres->l_flags &= ~USER_LOCK_BUSY; @@ -193,11 +194,11 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres) return; switch (lockres->l_blocking) { - case LKM_EXMODE: + case DLM_LOCK_EX: if (!lockres->l_ex_holders && !lockres->l_ro_holders) queue = 1; break; - case LKM_PRMODE: + case DLM_LOCK_PR: if (!lockres->l_ex_holders) queue = 1; break; @@ -209,9 +210,9 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres) __user_dlm_queue_lockres(lockres); } -static void user_bast(void *opaque, int level) +static void user_bast(struct ocfs2_dlm_lksb *lksb, int level) { - struct user_lock_res *lockres = opaque; + struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n", lockres->l_namelen, lockres->l_name, level); @@ -227,15 +228,15 @@ static void user_bast(void *opaque, int level) wake_up(&lockres->l_event); } -static void user_unlock_ast(void *opaque, enum dlm_status status) +static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status) { - struct user_lock_res *lockres = opaque; + struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen, lockres->l_name); - if (status != DLM_NORMAL && status != DLM_CANCELGRANT) - mlog(ML_ERROR, "Dlm returns status %d\n", status); + if (status) + mlog(ML_ERROR, "dlm returns status %d\n", status); spin_lock(&lockres->l_lock); /* The teardown flag gets set early during the unlock process, @@ -243,7 +244,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status) * for a concurrent cancel. */ if (lockres->l_flags & USER_LOCK_IN_TEARDOWN && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { - lockres->l_level = LKM_IVMODE; + lockres->l_level = DLM_LOCK_IV; } else if (status == DLM_CANCELGRANT) { /* We tried to cancel a convert request, but it was * already granted. Don't clear the busy flag - the @@ -254,7 +255,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status) } else { BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); /* Cancel succeeded, we want to re-queue */ - lockres->l_requested = LKM_IVMODE; /* cancel an + lockres->l_requested = DLM_LOCK_IV; /* cancel an * upconvert * request. */ lockres->l_flags &= ~USER_LOCK_IN_CANCEL; @@ -271,6 +272,21 @@ out_noclear: wake_up(&lockres->l_event); } +/* + * This is the userdlmfs locking protocol version. + * + * See fs/ocfs2/dlmglue.c for more details on locking versions. + */ +static struct ocfs2_locking_protocol user_dlm_lproto = { + .lp_max_version = { + .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, + .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, + }, + .lp_lock_ast = user_ast, + .lp_blocking_ast = user_bast, + .lp_unlock_ast = user_unlock_ast, +}; + static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) { struct inode *inode; @@ -283,7 +299,8 @@ static void user_dlm_unblock_lock(struct work_struct *work) int new_level, status; struct user_lock_res *lockres = container_of(work, struct user_lock_res, l_work); - struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); + struct ocfs2_cluster_connection *conn = + cluster_connection_from_user_lockres(lockres); mlog(0, "processing lockres %.*s\n", lockres->l_namelen, lockres->l_name); @@ -322,20 +339,17 @@ static void user_dlm_unblock_lock(struct work_struct *work) lockres->l_flags |= USER_LOCK_IN_CANCEL; spin_unlock(&lockres->l_lock); - status = dlmunlock(dlm, - &lockres->l_lksb, - LKM_CANCEL, - user_unlock_ast, - lockres); - if (status != DLM_NORMAL) - user_log_dlm_error("dlmunlock", status, lockres); + status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, + DLM_LKF_CANCEL); + if (status) + user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); goto drop_ref; } /* If there are still incompat holders, we can exit safely * without worrying about re-queueing this lock as that will * happen on the last call to user_cluster_unlock. */ - if ((lockres->l_blocking == LKM_EXMODE) + if ((lockres->l_blocking == DLM_LOCK_EX) && (lockres->l_ex_holders || lockres->l_ro_holders)) { spin_unlock(&lockres->l_lock); mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n", @@ -343,7 +357,7 @@ static void user_dlm_unblock_lock(struct work_struct *work) goto drop_ref; } - if ((lockres->l_blocking == LKM_PRMODE) + if ((lockres->l_blocking == DLM_LOCK_PR) && lockres->l_ex_holders) { spin_unlock(&lockres->l_lock); mlog(0, "can't downconvert for pr: ex = %u\n", @@ -360,17 +374,12 @@ static void user_dlm_unblock_lock(struct work_struct *work) spin_unlock(&lockres->l_lock); /* need lock downconvert request now... */ - status = dlmlock(dlm, - new_level, - &lockres->l_lksb, - LKM_CONVERT|LKM_VALBLK, - lockres->l_name, - lockres->l_namelen, - user_ast, - lockres, - user_bast); - if (status != DLM_NORMAL) { - user_log_dlm_error("dlmlock", status, lockres); + status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb, + DLM_LKF_CONVERT|DLM_LKF_VALBLK, + lockres->l_name, + lockres->l_namelen); + if (status) { + user_log_dlm_error("ocfs2_dlm_lock", status, lockres); user_recover_from_dlm_error(lockres); } @@ -382,10 +391,10 @@ static inline void user_dlm_inc_holders(struct user_lock_res *lockres, int level) { switch(level) { - case LKM_EXMODE: + case DLM_LOCK_EX: lockres->l_ex_holders++; break; - case LKM_PRMODE: + case DLM_LOCK_PR: lockres->l_ro_holders++; break; default: @@ -410,10 +419,11 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres, int lkm_flags) { int status, local_flags; - struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); + struct ocfs2_cluster_connection *conn = + cluster_connection_from_user_lockres(lockres); - if (level != LKM_EXMODE && - level != LKM_PRMODE) { + if (level != DLM_LOCK_EX && + level != DLM_LOCK_PR) { mlog(ML_ERROR, "lockres %.*s: invalid request!\n", lockres->l_namelen, lockres->l_name); status = -EINVAL; @@ -422,7 +432,7 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres, mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n", lockres->l_namelen, lockres->l_name, - (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE", + (level == DLM_LOCK_EX) ? "DLM_LOCK_EX" : "DLM_LOCK_PR", lkm_flags); again: @@ -457,35 +467,26 @@ again: } if (level > lockres->l_level) { - local_flags = lkm_flags | LKM_VALBLK; - if (lockres->l_level != LKM_IVMODE) - local_flags |= LKM_CONVERT; + local_flags = lkm_flags | DLM_LKF_VALBLK; + if (lockres->l_level != DLM_LOCK_IV) + local_flags |= DLM_LKF_CONVERT; lockres->l_requested = level; lockres->l_flags |= USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); - BUG_ON(level == LKM_IVMODE); - BUG_ON(level == LKM_NLMODE); + BUG_ON(level == DLM_LOCK_IV); + BUG_ON(level == DLM_LOCK_NL); /* call dlm_lock to upgrade lock now */ - status = dlmlock(dlm, - level, - &lockres->l_lksb, - local_flags, - lockres->l_name, - lockres->l_namelen, - user_ast, - lockres, - user_bast); - if (status != DLM_NORMAL) { - if ((lkm_flags & LKM_NOQUEUE) && - (status == DLM_NOTQUEUED)) - status = -EAGAIN; - else { - user_log_dlm_error("dlmlock", status, lockres); - status = -EINVAL; - } + status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb, + local_flags, lockres->l_name, + lockres->l_namelen); + if (status) { + if ((lkm_flags & DLM_LKF_NOQUEUE) && + (status != -EAGAIN)) + user_log_dlm_error("ocfs2_dlm_lock", + status, lockres); user_recover_from_dlm_error(lockres); goto bail; } @@ -506,11 +507,11 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres, int level) { switch(level) { - case LKM_EXMODE: + case DLM_LOCK_EX: BUG_ON(!lockres->l_ex_holders); lockres->l_ex_holders--; break; - case LKM_PRMODE: + case DLM_LOCK_PR: BUG_ON(!lockres->l_ro_holders); lockres->l_ro_holders--; break; @@ -522,8 +523,8 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres, void user_dlm_cluster_unlock(struct user_lock_res *lockres, int level) { - if (level != LKM_EXMODE && - level != LKM_PRMODE) { + if (level != DLM_LOCK_EX && + level != DLM_LOCK_PR) { mlog(ML_ERROR, "lockres %.*s: invalid request!\n", lockres->l_namelen, lockres->l_name); return; @@ -540,33 +541,40 @@ void user_dlm_write_lvb(struct inode *inode, unsigned int len) { struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; - char *lvb = lockres->l_lksb.lvb; + char *lvb; BUG_ON(len > DLM_LVB_LEN); spin_lock(&lockres->l_lock); - BUG_ON(lockres->l_level < LKM_EXMODE); + BUG_ON(lockres->l_level < DLM_LOCK_EX); + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); memcpy(lvb, val, len); spin_unlock(&lockres->l_lock); } -void user_dlm_read_lvb(struct inode *inode, - char *val, - unsigned int len) +ssize_t user_dlm_read_lvb(struct inode *inode, + char *val, + unsigned int len) { struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; - char *lvb = lockres->l_lksb.lvb; + char *lvb; + ssize_t ret = len; BUG_ON(len > DLM_LVB_LEN); spin_lock(&lockres->l_lock); - BUG_ON(lockres->l_level < LKM_PRMODE); - memcpy(val, lvb, len); + BUG_ON(lockres->l_level < DLM_LOCK_PR); + if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) { + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + memcpy(val, lvb, len); + } else + ret = 0; spin_unlock(&lockres->l_lock); + return ret; } void user_dlm_lock_res_init(struct user_lock_res *lockres, @@ -576,9 +584,9 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres, spin_lock_init(&lockres->l_lock); init_waitqueue_head(&lockres->l_event); - lockres->l_level = LKM_IVMODE; - lockres->l_requested = LKM_IVMODE; - lockres->l_blocking = LKM_IVMODE; + lockres->l_level = DLM_LOCK_IV; + lockres->l_requested = DLM_LOCK_IV; + lockres->l_blocking = DLM_LOCK_IV; /* should have been checked before getting here. */ BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); @@ -592,7 +600,8 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres, int user_dlm_destroy_lock(struct user_lock_res *lockres) { int status = -EBUSY; - struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); + struct ocfs2_cluster_connection *conn = + cluster_connection_from_user_lockres(lockres); mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name); @@ -627,14 +636,9 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) lockres->l_flags |= USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); - status = dlmunlock(dlm, - &lockres->l_lksb, - LKM_VALBLK, - user_unlock_ast, - lockres); - if (status != DLM_NORMAL) { - user_log_dlm_error("dlmunlock", status, lockres); - status = -EINVAL; + status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK); + if (status) { + user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); goto bail; } @@ -645,32 +649,34 @@ bail: return status; } -struct dlm_ctxt *user_dlm_register_context(struct qstr *name, - struct dlm_protocol_version *proto) +static void user_dlm_recovery_handler_noop(int node_num, + void *recovery_data) { - struct dlm_ctxt *dlm; - u32 dlm_key; - char *domain; - - domain = kmalloc(name->len + 1, GFP_NOFS); - if (!domain) { - mlog_errno(-ENOMEM); - return ERR_PTR(-ENOMEM); - } + /* We ignore recovery events */ + return; +} - dlm_key = crc32_le(0, name->name, name->len); +void user_dlm_set_locking_protocol(void) +{ + ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version); +} - snprintf(domain, name->len + 1, "%.*s", name->len, name->name); +struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name) +{ + int rc; + struct ocfs2_cluster_connection *conn; - dlm = dlm_register_domain(domain, dlm_key, proto); - if (IS_ERR(dlm)) - mlog_errno(PTR_ERR(dlm)); + rc = ocfs2_cluster_connect("o2cb", name->name, name->len, + &user_dlm_lproto, + user_dlm_recovery_handler_noop, + NULL, &conn); + if (rc) + mlog_errno(rc); - kfree(domain); - return dlm; + return rc ? ERR_PTR(rc) : conn; } -void user_dlm_unregister_context(struct dlm_ctxt *dlm) +void user_dlm_unregister(struct ocfs2_cluster_connection *conn) { - dlm_unregister_domain(dlm); + ocfs2_cluster_disconnect(conn, 0); } diff --git a/fs/ocfs2/dlmfs/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h index 0c3cc03c61fa..3b42d79531d7 100644 --- a/fs/ocfs2/dlmfs/userdlm.h +++ b/fs/ocfs2/dlmfs/userdlm.h @@ -57,7 +57,7 @@ struct user_lock_res { int l_level; unsigned int l_ro_holders; unsigned int l_ex_holders; - struct dlm_lockstatus l_lksb; + struct ocfs2_dlm_lksb l_lksb; int l_requested; int l_blocking; @@ -80,15 +80,15 @@ void user_dlm_cluster_unlock(struct user_lock_res *lockres, void user_dlm_write_lvb(struct inode *inode, const char *val, unsigned int len); -void user_dlm_read_lvb(struct inode *inode, - char *val, - unsigned int len); -struct dlm_ctxt *user_dlm_register_context(struct qstr *name, - struct dlm_protocol_version *proto); -void user_dlm_unregister_context(struct dlm_ctxt *dlm); +ssize_t user_dlm_read_lvb(struct inode *inode, + char *val, + unsigned int len); +struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name); +void user_dlm_unregister(struct ocfs2_cluster_connection *conn); +void user_dlm_set_locking_protocol(void); struct dlmfs_inode_private { - struct dlm_ctxt *ip_dlm; + struct ocfs2_cluster_connection *ip_conn; struct user_lock_res ip_lockres; /* unused for directories. */ struct inode *ip_parent; -- cgit v1.2.3 From cbe0e331fdbdb256943499358c75bc098a2134c1 Mon Sep 17 00:00:00 2001 From: Joel Becker <joel.becker@oracle.com> Date: Sat, 30 Jan 2010 06:02:10 -0800 Subject: ocfs2_dlmfs: Enable the use of user cluster stacks. Unlike ocfs2, dlmfs has no permanent storage. It can't store off a cluster stack it is supposed to be using. So it can't specify the stack name in ocfs2_cluster_connect(). Instead, we create ocfs2_cluster_connect_agnostic(), which simply uses the stack that is currently enabled. This is find for dlmfs, which will rely on the stack initialization. We add the "stackglue" capability to dlmfs's capability list. This lets userspace know dlmfs can be used with all cluster stacks. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/dlmfs/dlmfs.c | 2 +- fs/ocfs2/dlmfs/userdlm.c | 8 ++++---- fs/ocfs2/ocfs2_lockingver.h | 2 ++ fs/ocfs2/stackglue.c | 18 ++++++++++++++++++ fs/ocfs2/stackglue.h | 11 +++++++++++ 5 files changed, 36 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 8697366b63ad..1b0de157a08c 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -87,7 +87,7 @@ struct workqueue_struct *user_dlm_worker; * - bast : POLLIN against the file descriptor of a held lock * signifies a bast fired on the lock. */ -#define DLMFS_CAPABILITIES "bast" +#define DLMFS_CAPABILITIES "bast stackglue" extern int param_set_dlmfs_capabilities(const char *val, struct kernel_param *kp) { diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c index c1b6a56a268f..2858ee6003c7 100644 --- a/fs/ocfs2/dlmfs/userdlm.c +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -666,10 +666,10 @@ struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name) int rc; struct ocfs2_cluster_connection *conn; - rc = ocfs2_cluster_connect("o2cb", name->name, name->len, - &user_dlm_lproto, - user_dlm_recovery_handler_noop, - NULL, &conn); + rc = ocfs2_cluster_connect_agnostic(name->name, name->len, + &user_dlm_lproto, + user_dlm_recovery_handler_noop, + NULL, &conn); if (rc) mlog_errno(rc); diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h index 82d5eeac0fff..2e45c8d2ea7e 100644 --- a/fs/ocfs2/ocfs2_lockingver.h +++ b/fs/ocfs2/ocfs2_lockingver.h @@ -23,6 +23,8 @@ /* * The protocol version for ocfs2 cluster locking. See dlmglue.c for * more details. + * + * 1.0 - Initial locking version from ocfs2 1.4. */ #define OCFS2_LOCKING_PROTOCOL_MAJOR 1 #define OCFS2_LOCKING_PROTOCOL_MINOR 0 diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 31db2e87cfd4..39abf89697ed 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -373,6 +373,24 @@ out: } EXPORT_SYMBOL_GPL(ocfs2_cluster_connect); +/* The caller will ensure all nodes have the same cluster stack */ +int ocfs2_cluster_connect_agnostic(const char *group, + int grouplen, + struct ocfs2_locking_protocol *lproto, + void (*recovery_handler)(int node_num, + void *recovery_data), + void *recovery_data, + struct ocfs2_cluster_connection **conn) +{ + char *stack_name = NULL; + + if (cluster_stack_name[0]) + stack_name = cluster_stack_name; + return ocfs2_cluster_connect(stack_name, group, grouplen, lproto, + recovery_handler, recovery_data, conn); +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); + /* If hangup_pending is 0, the stack driver will be dropped */ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, int hangup_pending) diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index b1981ba4c91f..8ce7398ae1d2 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -246,6 +246,17 @@ int ocfs2_cluster_connect(const char *stack_name, void *recovery_data), void *recovery_data, struct ocfs2_cluster_connection **conn); +/* + * Used by callers that don't store their stack name. They must ensure + * all nodes have the same stack. + */ +int ocfs2_cluster_connect_agnostic(const char *group, + int grouplen, + struct ocfs2_locking_protocol *lproto, + void (*recovery_handler)(int node_num, + void *recovery_data), + void *recovery_data, + struct ocfs2_cluster_connection **conn); int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, int hangup_pending); void ocfs2_cluster_hangup(const char *group, int grouplen); -- cgit v1.2.3 From 66b116c9d8f70baadf5b2145dddb35af222df041 Mon Sep 17 00:00:00 2001 From: Coly Li <coly.li@suse.de> Date: Thu, 25 Feb 2010 14:57:13 +0800 Subject: ocfs2: fix warning in ocfs2_file_aio_write() This patch fixes a compiling warning in ocfs2_file_aio_write(). Signed-off-by: Coly Li <coly.li@suse.de> Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index da097bd07b72..c8a4a2939e55 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2041,7 +2041,7 @@ out_dio: * async dio is going to do it in the future or an end_io after an * error has already done it. */ - if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { + if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { rw_level = -1; have_alloc_sem = 0; } -- cgit v1.2.3 From cbaee472f274ea9a98aabe47025f6e5551acadcb Mon Sep 17 00:00:00 2001 From: Tao Ma <tao.ma@oracle.com> Date: Fri, 26 Feb 2010 10:54:52 +0800 Subject: ocfs2: Only bug out in direct io write for reflinked extent. In ocfs2_direct_IO_get_blocks, we only need to bug out in case of we are going to write a recounted extent rec. What a silly bug introduced by me! Signed-off-by: Tao Ma <tao.ma@oracle.com> Signed-off-by: Joel Becker <joel.becker@oracle.com> Cc: stable@kernel.org --- fs/ocfs2/aops.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 7e9df11260f4..4c2a6d282c4d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -577,8 +577,9 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, goto bail; } - /* We should already CoW the refcounted extent. */ - BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); + /* We should already CoW the refcounted extent in case of create. */ + BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); + /* * get_more_blocks() expects us to describe a hole by clearing * the mapped bit on bh_result(). -- cgit v1.2.3 From bc9838c4d44a1713ab1bf24aa6675bc3a02b6a88 Mon Sep 17 00:00:00 2001 From: Srinivas Eeda <srinivas.eeda@oracle.com> Date: Fri, 26 Feb 2010 12:53:51 -0800 Subject: dlm: allow dlm do recovery during shutdown If a node down event happens while dlm shutdown in progress, dlm recovery should be done before dlm is shutdown. We can't migrate unrecovered locks, obviously. But dlm_reco_thread only does recovery if the dlm_state is in DLM_CTXT_JOINED. dlm_reco_thread should do recovery if dlm_state is in DLM_CTXT_JOINED or DLM_CTXT_IN_SHUTDOWN. Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com> Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/dlm/dlmrecovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 344bcf90cbf4..b4f99de2caf3 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -310,7 +310,7 @@ static int dlm_recovery_thread(void *data) mlog(0, "dlm thread running for %s...\n", dlm->name); while (!kthread_should_stop()) { - if (dlm_joined(dlm)) { + if (dlm_domain_fully_joined(dlm)) { status = dlm_do_recovery(dlm); if (status == -EAGAIN) { /* do not sleep, recheck immediately. */ -- cgit v1.2.3 From 4912002fffa377e66c5caefc2c311732a4ad5fb8 Mon Sep 17 00:00:00 2001 From: Christian Kujau <lists@nerdbynature.de> Date: Fri, 26 Feb 2010 17:25:14 +0000 Subject: Remove EXPERIMENTAL from NFS_FSCACHE There's currently an open Ubuntu bug[0], with the intent to compile NFS_FSCACHE (and possibly AFS_FSCACHE, 9P_FSCACHE) into the standard Ubuntu kernel. However, since *_FSCACHE still depends on EXPERIMENTAL, this won't happen. As Arjan van de Ven pointed out[1], the EXPERIMENTAL flag doesn't mean that much any more, I propose the following patch to fs/nfs/Kconfig. I'd do the same for fs/9p/Kconfig and fs/afs/Kconfig, but as I did not test 9p or AFS, I feel it would not be appropriate for me to remove the flag. [0] https://bugs.launchpad.net/ubuntu/+source/linux/+bug/440522/comments/5 [1] http://lkml.org/lkml/2010/1/23/145 Signed-off-by: Christian Kujau <lists@nerdbynature.de> Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/nfs/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 59e5673b4597..a43d07e7b924 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -95,8 +95,7 @@ config ROOT_NFS Most people say N here. config NFS_FSCACHE - bool "Provide NFS client caching support (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "Provide NFS client caching support" depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y help Say Y here if you want NFS data to be cached locally on disc through -- cgit v1.2.3 From 9b915181af0a99fe94ef0152e6a4ca9990c3b6d0 Mon Sep 17 00:00:00 2001 From: Sunil Mushran <sunil.mushran@oracle.com> Date: Fri, 26 Feb 2010 19:42:44 -0800 Subject: ocfs2: Use a separate masklog for AST and BASTs This patch adds a new masklog and uses it allow tracing ASTs and BASTs in the dlmglue layer. This has been found to be very useful in debugging cluster locking issues. Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com> Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/cluster/masklog.c | 1 + fs/ocfs2/cluster/masklog.h | 1 + fs/ocfs2/dlmglue.c | 90 +++++++++++++++++++++++++++++++++------------- 3 files changed, 67 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 1cd2934de615..b39da877b12f 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -112,6 +112,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(XATTR), define_mask(QUOTA), define_mask(REFCOUNT), + define_mask(BASTS), define_mask(ERROR), define_mask(NOTICE), define_mask(KTHREAD), diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 0442366b3060..3dfddbec32f2 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -114,6 +114,7 @@ #define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ #define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ #define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ +#define ML_BASTS 0x0000001000000000ULL /* dlmglue asts and basts */ /* bits that are infrequently given and frequently matched in the high word */ #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index d009d7744d63..8298608d4165 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -932,6 +932,10 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, lockres->l_blocking = level; } + mlog(ML_BASTS, "lockres %s, block %d, level %d, l_block %d, dwn %d\n", + lockres->l_name, level, lockres->l_level, lockres->l_blocking, + needs_downconvert); + if (needs_downconvert) lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); @@ -1054,8 +1058,8 @@ static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level) BUG_ON(level <= DLM_LOCK_NL); - mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n", - lockres->l_name, level, lockres->l_level, + mlog(ML_BASTS, "BAST fired for lockres %s, blocking %d, level %d, " + "type %s\n", lockres->l_name, level, lockres->l_level, ocfs2_lock_type_string(lockres->l_type)); /* @@ -1099,6 +1103,10 @@ static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb) return; } + mlog(ML_BASTS, "AST fired for lockres %s, action %d, unlock %d, " + "level %d => %d\n", lockres->l_name, lockres->l_action, + lockres->l_unlock_action, lockres->l_level, lockres->l_requested); + switch(lockres->l_action) { case OCFS2_AST_ATTACH: ocfs2_generic_handle_attach_action(lockres); @@ -1111,8 +1119,8 @@ static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb) ocfs2_generic_handle_downconvert_action(lockres); break; default: - mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u " - "lockres flags = 0x%lx, unlock action: %u\n", + mlog(ML_ERROR, "lockres %s: AST fired with invalid action: %u, " + "flags 0x%lx, unlock: %u\n", lockres->l_name, lockres->l_action, lockres->l_flags, lockres->l_unlock_action); BUG(); @@ -1145,8 +1153,8 @@ static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error) mlog_entry_void(); - mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name, - lockres->l_unlock_action); + mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n", + lockres->l_name, lockres->l_unlock_action); spin_lock_irqsave(&lockres->l_lock, flags); if (error) { @@ -1497,7 +1505,7 @@ again: BUG_ON(level == DLM_LOCK_IV); BUG_ON(level == DLM_LOCK_NL); - mlog(0, "lock %s, convert from %d to level = %d\n", + mlog(ML_BASTS, "lockres %s, convert from %d to %d\n", lockres->l_name, lockres->l_level, level); /* call dlm_lock to upgrade lock now */ @@ -3314,13 +3322,20 @@ static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); if (lockres->l_level <= new_level) { - mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n", - lockres->l_level, new_level); + mlog(ML_ERROR, "lockres %s, lvl %d <= %d, blcklst %d, mask %d, " + "type %d, flags 0x%lx, hold %d %d, act %d %d, req %d, " + "block %d, pgen %d\n", lockres->l_name, lockres->l_level, + new_level, list_empty(&lockres->l_blocked_list), + list_empty(&lockres->l_mask_waiters), lockres->l_type, + lockres->l_flags, lockres->l_ro_holders, + lockres->l_ex_holders, lockres->l_action, + lockres->l_unlock_action, lockres->l_requested, + lockres->l_blocking, lockres->l_pending_gen); BUG(); } - mlog(0, "lock %s, new_level = %d, l_blocking = %d\n", - lockres->l_name, new_level, lockres->l_blocking); + mlog(ML_BASTS, "lockres %s, level %d => %d, blocking %d\n", + lockres->l_name, lockres->l_level, new_level, lockres->l_blocking); lockres->l_action = OCFS2_AST_DOWNCONVERT; lockres->l_requested = new_level; @@ -3339,6 +3354,9 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, mlog_entry_void(); + mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name, + lockres->l_level, new_level); + if (lvb) dlm_flags |= DLM_LKF_VALBLK; @@ -3368,14 +3386,12 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, assert_spin_locked(&lockres->l_lock); mlog_entry_void(); - mlog(0, "lock %s\n", lockres->l_name); if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { /* If we're already trying to cancel a lock conversion * then just drop the spinlock and allow the caller to * requeue this lock. */ - - mlog(0, "Lockres %s, skip convert\n", lockres->l_name); + mlog(ML_BASTS, "lockres %s, skip convert\n", lockres->l_name); return 0; } @@ -3390,6 +3406,8 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, "lock %s, invalid flags: 0x%lx\n", lockres->l_name, lockres->l_flags); + mlog(ML_BASTS, "lockres %s\n", lockres->l_name); + return 1; } @@ -3399,7 +3417,6 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb, int ret; mlog_entry_void(); - mlog(0, "lock %s\n", lockres->l_name); ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, DLM_LKF_CANCEL); @@ -3408,7 +3425,7 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb, ocfs2_recover_from_dlm_error(lockres, 0); } - mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name); + mlog(ML_BASTS, "lockres %s\n", lockres->l_name); mlog_exit(ret); return ret; @@ -3465,8 +3482,11 @@ recheck: * at the same time they set OCFS2_DLM_BUSY. They must * clear OCFS2_DLM_PENDING after dlm_lock() returns. */ - if (lockres->l_flags & OCFS2_LOCK_PENDING) + if (lockres->l_flags & OCFS2_LOCK_PENDING) { + mlog(ML_BASTS, "lockres %s, ReQ: Pending\n", + lockres->l_name); goto leave_requeue; + } ctl->requeue = 1; ret = ocfs2_prepare_cancel_convert(osb, lockres); @@ -3498,6 +3518,7 @@ recheck: */ if (lockres->l_level == DLM_LOCK_NL) { BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders); + mlog(ML_BASTS, "lockres %s, Aborting dc\n", lockres->l_name); lockres->l_blocking = DLM_LOCK_NL; lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); spin_unlock_irqrestore(&lockres->l_lock, flags); @@ -3507,28 +3528,41 @@ recheck: /* if we're blocking an exclusive and we have *any* holders, * then requeue. */ if ((lockres->l_blocking == DLM_LOCK_EX) - && (lockres->l_ex_holders || lockres->l_ro_holders)) + && (lockres->l_ex_holders || lockres->l_ro_holders)) { + mlog(ML_BASTS, "lockres %s, ReQ: EX/PR Holders %u,%u\n", + lockres->l_name, lockres->l_ex_holders, + lockres->l_ro_holders); goto leave_requeue; + } /* If it's a PR we're blocking, then only * requeue if we've got any EX holders */ if (lockres->l_blocking == DLM_LOCK_PR && - lockres->l_ex_holders) + lockres->l_ex_holders) { + mlog(ML_BASTS, "lockres %s, ReQ: EX Holders %u\n", + lockres->l_name, lockres->l_ex_holders); goto leave_requeue; + } /* * Can we get a lock in this state if the holder counts are * zero? The meta data unblock code used to check this. */ if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) - && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) + && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) { + mlog(ML_BASTS, "lockres %s, ReQ: Lock Refreshing\n", + lockres->l_name); goto leave_requeue; + } new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); if (lockres->l_ops->check_downconvert - && !lockres->l_ops->check_downconvert(lockres, new_level)) + && !lockres->l_ops->check_downconvert(lockres, new_level)) { + mlog(ML_BASTS, "lockres %s, ReQ: Checkpointing\n", + lockres->l_name); goto leave_requeue; + } /* If we get here, then we know that there are no more * incompatible holders (and anyone asking for an incompatible @@ -3546,13 +3580,19 @@ recheck: ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking); - if (ctl->unblock_action == UNBLOCK_STOP_POST) + if (ctl->unblock_action == UNBLOCK_STOP_POST) { + mlog(ML_BASTS, "lockres %s, UNBLOCK_STOP_POST\n", + lockres->l_name); goto leave; + } spin_lock_irqsave(&lockres->l_lock, flags); if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) { /* If this changed underneath us, then we can't drop * it just yet. */ + mlog(ML_BASTS, "lockres %s, block=%d:%d, level=%d:%d, " + "Recheck\n", lockres->l_name, blocking, + lockres->l_blocking, level, lockres->l_level); goto recheck; } @@ -3963,7 +4003,7 @@ static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, BUG_ON(!lockres); BUG_ON(!lockres->l_ops); - mlog(0, "lockres %s blocked.\n", lockres->l_name); + mlog(ML_BASTS, "lockres %s blocked\n", lockres->l_name); /* Detect whether a lock has been marked as going away while * the downconvert thread was processing other things. A lock can @@ -3986,7 +4026,7 @@ unqueue: } else ocfs2_schedule_blocked_lock(osb, lockres); - mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name, + mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name, ctl.requeue ? "yes" : "no"); spin_unlock_irqrestore(&lockres->l_lock, flags); @@ -4008,7 +4048,7 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, /* Do not schedule a lock for downconvert when it's on * the way to destruction - any nodes wanting access * to the resource will get it soon. */ - mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n", + mlog(ML_BASTS, "lockres %s won't be scheduled: flags 0x%lx\n", lockres->l_name, lockres->l_flags); return; } -- cgit v1.2.3 From 6fcef3f04a1a0f8d7a086147d2f2e650c8cc2754 Mon Sep 17 00:00:00 2001 From: Sunil Mushran <sunil.mushran@oracle.com> Date: Fri, 26 Feb 2010 19:42:45 -0800 Subject: ocfs2/userdlm: Add tracing in userdlm Make use of the newly added BASTS masklog to trace ASTs and BASTs in userdlm. Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com> Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/dlmfs/userdlm.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c index 2858ee6003c7..0499e3fb7bdb 100644 --- a/fs/ocfs2/dlmfs/userdlm.c +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -128,8 +128,9 @@ static void user_ast(struct ocfs2_dlm_lksb *lksb) struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); int status; - mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen, - lockres->l_name); + mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n", + lockres->l_namelen, lockres->l_name, lockres->l_level, + lockres->l_requested); spin_lock(&lockres->l_lock); @@ -214,8 +215,8 @@ static void user_bast(struct ocfs2_dlm_lksb *lksb, int level) { struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); - mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n", - lockres->l_namelen, lockres->l_name, level); + mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n", + lockres->l_namelen, lockres->l_name, level, lockres->l_level); spin_lock(&lockres->l_lock); lockres->l_flags |= USER_LOCK_BLOCKED; @@ -232,8 +233,8 @@ static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status) { struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); - mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen, - lockres->l_name); + mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n", + lockres->l_namelen, lockres->l_name, lockres->l_flags); if (status) mlog(ML_ERROR, "dlm returns status %d\n", status); @@ -302,8 +303,7 @@ static void user_dlm_unblock_lock(struct work_struct *work) struct ocfs2_cluster_connection *conn = cluster_connection_from_user_lockres(lockres); - mlog(0, "processing lockres %.*s\n", lockres->l_namelen, - lockres->l_name); + mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name); spin_lock(&lockres->l_lock); @@ -321,17 +321,23 @@ static void user_dlm_unblock_lock(struct work_struct *work) * flag, and finally we might get another bast which re-queues * us before our ast for the downconvert is called. */ if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { + mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n", + lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); goto drop_ref; } if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n", + lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); goto drop_ref; } if (lockres->l_flags & USER_LOCK_BUSY) { if (lockres->l_flags & USER_LOCK_IN_CANCEL) { + mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n", + lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); goto drop_ref; } @@ -352,16 +358,18 @@ static void user_dlm_unblock_lock(struct work_struct *work) if ((lockres->l_blocking == DLM_LOCK_EX) && (lockres->l_ex_holders || lockres->l_ro_holders)) { spin_unlock(&lockres->l_lock); - mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n", - lockres->l_ro_holders, lockres->l_ex_holders); + mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n", + lockres->l_namelen, lockres->l_name, + lockres->l_ex_holders, lockres->l_ro_holders); goto drop_ref; } if ((lockres->l_blocking == DLM_LOCK_PR) && lockres->l_ex_holders) { spin_unlock(&lockres->l_lock); - mlog(0, "can't downconvert for pr: ex = %u\n", - lockres->l_ex_holders); + mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n", + lockres->l_namelen, lockres->l_name, + lockres->l_ex_holders); goto drop_ref; } @@ -369,8 +377,8 @@ static void user_dlm_unblock_lock(struct work_struct *work) new_level = user_highest_compat_lock_level(lockres->l_blocking); lockres->l_requested = new_level; lockres->l_flags |= USER_LOCK_BUSY; - mlog(0, "Downconvert lock from %d to %d\n", - lockres->l_level, new_level); + mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n", + lockres->l_namelen, lockres->l_name, lockres->l_level, new_level); spin_unlock(&lockres->l_lock); /* need lock downconvert request now... */ @@ -430,10 +438,8 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres, goto bail; } - mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n", - lockres->l_namelen, lockres->l_name, - (level == DLM_LOCK_EX) ? "DLM_LOCK_EX" : "DLM_LOCK_PR", - lkm_flags); + mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n", + lockres->l_namelen, lockres->l_name, level, lkm_flags); again: if (signal_pending(current)) { @@ -603,7 +609,7 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) struct ocfs2_cluster_connection *conn = cluster_connection_from_user_lockres(lockres); - mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name); + mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name); spin_lock(&lockres->l_lock); if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { -- cgit v1.2.3 From 5051f76883897ea3d3d034c92e7b84236da2ec57 Mon Sep 17 00:00:00 2001 From: Wengang Wang <wen.gang.wang@oracle.com> Date: Fri, 26 Feb 2010 18:18:25 +0800 Subject: ocfs2: send SIGXFSZ if new filesize exceeds limit -v2 This patch makes ocfs2 send SIGXFSZ if new file size exceeds the rlimit. Processes may get SIGXFSZ on one node (in the cluster) while others will not on another if file size limits are different on the two nodes. Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com> Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/file.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index c8a4a2939e55..5b52547d6299 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -993,10 +993,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } if (size_change && attr->ia_size != i_size_read(inode)) { - if (attr->ia_size > sb->s_maxbytes) { - status = -EFBIG; + status = inode_newsize_ok(inode, attr->ia_size); + if (status) goto bail_unlock; - } if (i_size_read(inode) > attr->ia_size) { if (ocfs2_should_order_data(inode)) { -- cgit v1.2.3 From 34ce4e7c23e3da578e459b05c6fb17edecb19e6b Mon Sep 17 00:00:00 2001 From: Boaz Harrosh <bharrosh@panasas.com> Date: Tue, 15 Dec 2009 19:34:17 +0200 Subject: exofs: debug print even less * Last debug trimming left in some stupid print, remove them. Fixup some other prints * Shift printing from inode.c to ios.c * Add couple of prints when memory allocation fails. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> --- fs/exofs/inode.c | 23 +++++++++++++---------- fs/exofs/ios.c | 38 ++++++++++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 2afbcebeda71..c88a0c5250cb 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -193,7 +193,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock) else good_bytes = pcol->length - resid; - EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx" + EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", pcol->inode->i_ino, _LLU(good_bytes), pcol->length, pcol->nr_pages); @@ -222,7 +222,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock) } pcol_free(pcol); - EXOFS_DBGMSG("readpages_done END\n"); + EXOFS_DBGMSG2("readpages_done END\n"); return ret; } @@ -290,7 +290,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync) atomic_inc(&pcol->sbi->s_curr_pending); - EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", + EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", ios->obj.id, _LLU(ios->offset), pcol->length); /* pages ownership was passed to pcol_copy */ @@ -462,7 +462,7 @@ static void writepages_done(struct exofs_io_state *ios, void *p) else good_bytes = pcol->length - resid; - EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx" + EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", pcol->inode->i_ino, _LLU(good_bytes), pcol->length, pcol->nr_pages); @@ -490,7 +490,7 @@ static void writepages_done(struct exofs_io_state *ios, void *p) pcol_free(pcol); kfree(pcol); - EXOFS_DBGMSG("writepages_done END\n"); + EXOFS_DBGMSG2("writepages_done END\n"); } static int write_exec(struct page_collect *pcol) @@ -527,7 +527,7 @@ static int write_exec(struct page_collect *pcol) } atomic_inc(&pcol->sbi->s_curr_pending); - EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", + EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), pcol->length); /* pages ownership was passed to pcol_copy */ @@ -616,7 +616,7 @@ try_again: ret = pcol_add_page(pcol, page, len); if (unlikely(ret)) { - EXOFS_DBGMSG("Failed pcol_add_page " + EXOFS_DBGMSG2("Failed pcol_add_page " "nr_pages=%u total_length=0x%lx\n", pcol->nr_pages, pcol->length); @@ -663,7 +663,7 @@ static int exofs_writepages(struct address_space *mapping, if (expected_pages < 32L) expected_pages = 32L; - EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx " + EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx " "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n", mapping->host->i_ino, wbc->range_start, wbc->range_end, mapping->nrpages, start, end, expected_pages); @@ -1170,8 +1170,10 @@ static int exofs_update_inode(struct inode *inode, int do_sync) int ret; args = kzalloc(sizeof(*args), GFP_KERNEL); - if (!args) + if (!args) { + EXOFS_DBGMSG("Faild kzalloc of args\n"); return -ENOMEM; + } fcb = &args->fcb; @@ -1234,7 +1236,8 @@ static int exofs_update_inode(struct inode *inode, int do_sync) free_args: kfree(args); out: - EXOFS_DBGMSG("ret=>%d\n", ret); + EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n", + inode->i_ino, do_sync, ret); return ret; } diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 5bad01fa1f9f..3cc0dd3f0eb2 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -26,6 +26,9 @@ #include "exofs.h" +#define EXOFS_DBGMSG2(M...) do {} while (0) +/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */ + void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) { osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); @@ -73,6 +76,8 @@ int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios) */ ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL); if (unlikely(!ios)) { + EXOFS_DBGMSG("Faild kzalloc bytes=%d\n", + exofs_io_state_size(sbi->s_numdevs)); *pios = NULL; return -ENOMEM; } @@ -276,6 +281,9 @@ int exofs_sbi_write(struct exofs_io_state *ios) bio = bio_kmalloc(GFP_KERNEL, ios->bio->bi_max_vecs); if (unlikely(!bio)) { + EXOFS_DBGMSG( + "Faild to allocate BIO size=%u\n", + ios->bio->bi_max_vecs); ret = -ENOMEM; goto out; } @@ -290,14 +298,21 @@ int exofs_sbi_write(struct exofs_io_state *ios) osd_req_write(or, &ios->obj, ios->offset, bio, ios->length); -/* EXOFS_DBGMSG("write sync=%d\n", sync);*/ + EXOFS_DBGMSG("write(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d\n", + _LLU(ios->obj.id), _LLU(ios->offset), + _LLU(ios->length), i); } else if (ios->kern_buff) { osd_req_write_kern(or, &ios->obj, ios->offset, ios->kern_buff, ios->length); -/* EXOFS_DBGMSG("write_kern sync=%d\n", sync);*/ + EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d\n", + _LLU(ios->obj.id), _LLU(ios->offset), + _LLU(ios->length), i); } else { osd_req_set_attributes(or, &ios->obj); -/* EXOFS_DBGMSG("set_attributes sync=%d\n", sync);*/ + EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", + _LLU(ios->obj.id), ios->out_attr_len, i); } if (ios->out_attr) @@ -335,14 +350,25 @@ int exofs_sbi_read(struct exofs_io_state *ios) if (ios->bio) { osd_req_read(or, &ios->obj, ios->offset, ios->bio, ios->length); -/* EXOFS_DBGMSG("read sync=%d\n", sync);*/ + EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" + " dev=%d\n", _LLU(ios->obj.id), + _LLU(ios->offset), + _LLU(ios->length), + first_dev); } else if (ios->kern_buff) { osd_req_read_kern(or, &ios->obj, ios->offset, ios->kern_buff, ios->length); -/* EXOFS_DBGMSG("read_kern sync=%d\n", sync);*/ + EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d\n", + _LLU(ios->obj.id), + _LLU(ios->offset), + _LLU(ios->length), + first_dev); } else { osd_req_get_attributes(or, &ios->obj); -/* EXOFS_DBGMSG("get_attributes sync=%d\n", sync);*/ + EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", + _LLU(ios->obj.id), ios->in_attr_len, + first_dev); } if (ios->out_attr) -- cgit v1.2.3 From 518f167a37b3c53f3cf44d27800455ca24e920f6 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh <bharrosh@panasas.com> Date: Thu, 21 Jan 2010 20:00:02 +0200 Subject: exofs: Micro-optimize exofs_i_info optimize the exofs_i_info struct usage by moving the embedded vfs_inode to be first. A compiler might optimize away an "add" operation with constant zero. (Which it cannot with other constants) Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> --- fs/exofs/exofs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index c35fd4623986..13663da2b119 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -78,13 +78,13 @@ struct exofs_sb_info { * our extension to the in-memory inode */ struct exofs_i_info { + struct inode vfs_inode; /* normal in-memory inode */ + wait_queue_head_t i_wq; /* wait queue for inode */ unsigned long i_flags; /* various atomic flags */ uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ uint32_t i_dir_start_lookup; /* which page to start lookup */ - wait_queue_head_t i_wq; /* wait queue for inode */ uint64_t i_commit_size; /* the object's written length */ uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ - struct inode vfs_inode; /* normal in-memory inode */ }; static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) -- cgit v1.2.3 From 22ddc556380cf5645c52292b6d980766646eb864 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh <bharrosh@panasas.com> Date: Tue, 19 Jan 2010 19:24:45 +0200 Subject: exofs: Recover in the case of read-passed-end-of-file In check_io, implement the case of reading passed end of file, by clearing the pages and recover with no error. In a raid arrangement this can become a legitimate situation in case of holes in the file. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> --- fs/exofs/ios.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 3cc0dd3f0eb2..439c5d097b27 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -173,6 +173,21 @@ static int exofs_io_execute(struct exofs_io_state *ios) return ret; } +static void _clear_bio(struct bio *bio) +{ + struct bio_vec *bv; + unsigned i; + + __bio_for_each_segment(bv, bio, i, 0) { + unsigned this_count = bv->bv_len; + + if (likely(PAGE_SIZE == this_count)) + clear_highpage(bv->bv_page); + else + zero_user(bv->bv_page, bv->bv_offset, this_count); + } +} + int exofs_check_io(struct exofs_io_state *ios, u64 *resid) { enum osd_err_priority acumulated_osd_err = 0; @@ -181,16 +196,25 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) for (i = 0; i < ios->numdevs; i++) { struct osd_sense_info osi; - int ret = osd_req_decode_sense(ios->per_dev[i].or, &osi); + struct osd_request *or = ios->per_dev[i].or; + int ret; + + if (unlikely(!or)) + continue; + ret = osd_req_decode_sense(or, &osi); if (likely(!ret)) continue; - if (unlikely(ret == -EFAULT)) { - EXOFS_DBGMSG("%s: EFAULT Need page clear\n", __func__); - /*FIXME: All the pages in this device range should: - * clear_highpage(page); - */ + if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { + /* start read offset passed endof file */ + _clear_bio(ios->per_dev[i].bio); + EXOFS_DBGMSG("start read offset passed end of file " + "offset=0x%llx, length=0x%llx\n", + _LLU(ios->offset), + _LLU(ios->length)); + + continue; /* we recovered */ } if (osi.osd_err_pri >= acumulated_osd_err) { -- cgit v1.2.3 From 45d3abcb1a7388b2b97582e13bf9dd21784dcaa5 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh <bharrosh@panasas.com> Date: Thu, 28 Jan 2010 11:46:16 +0200 Subject: exofs: Move layout related members to a layout structure * Abstract away those members in exofs_sb_info that are related/needed by a layout into a new exofs_layout structure. Embed it in exofs_sb_info. * At exofs_io_state receive/keep a pointer to an exofs_layout. No need for an exofs_sb_info pointer, all we need is at exofs_layout. * Change any usage of above exofs_sb_info members to their new name. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> --- fs/exofs/exofs.h | 24 ++++++++++++++++++------ fs/exofs/inode.c | 14 +++++++------- fs/exofs/ios.c | 36 +++++++++++++++++++----------------- fs/exofs/super.c | 51 +++++++++++++++++++++++++++------------------------ 4 files changed, 71 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 13663da2b119..33c68568b338 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -55,12 +55,18 @@ /* u64 has problems with printk this will cast it to unsigned long long */ #define _LLU(x) (unsigned long long)(x) +struct exofs_layout { + osd_id s_pid; /* partition ID of file system*/ + + unsigned s_numdevs; /* Num of devices in array */ + struct osd_dev *s_ods[0]; /* Variable length */ +}; + /* * our extension to the in-memory superblock */ struct exofs_sb_info { struct exofs_fscb s_fscb; /* Written often, pre-allocate*/ - osd_id s_pid; /* partition ID of file system*/ int s_timeout; /* timeout for OSD operations */ uint64_t s_nextid; /* highest object ID used */ uint32_t s_numfiles; /* number of files on fs */ @@ -69,9 +75,14 @@ struct exofs_sb_info { atomic_t s_curr_pending; /* number of pending commands */ uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ - struct pnfs_osd_data_map data_map; /* Default raid to use */ - unsigned s_numdevs; /* Num of devices in array */ - struct osd_dev *s_ods[1]; /* Variable length, minimum 1 */ + struct pnfs_osd_data_map data_map; /* Default raid to use + * FIXME: Needed ? + */ +/* struct exofs_layout dir_layout;*/ /* Default dir layout */ + struct exofs_layout layout; /* Default files layout, + * contains the variable osd_dev + * array. Keep last */ + struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ }; /* @@ -101,7 +112,7 @@ struct exofs_io_state { void *private; exofs_io_done_fn done; - struct exofs_sb_info *sbi; + struct exofs_layout *layout; struct osd_obj_id obj; u8 *cred; @@ -189,7 +200,8 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, u64 offset, void *p, unsigned length); -int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios); +int exofs_get_io_state(struct exofs_layout *layout, + struct exofs_io_state **ios); void exofs_put_io_state(struct exofs_io_state *ios); int exofs_check_io(struct exofs_io_state *ios, u64 *resid); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index c88a0c5250cb..03189a958b33 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -65,7 +65,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, /* Create master bios on first Q, later on cloning, each clone will be * allocated on it's destination Q */ - pcol->req_q = osd_request_queue(sbi->s_ods[0]); + pcol->req_q = osd_request_queue(sbi->layout.s_ods[0]); pcol->inode = inode; pcol->expected_pages = expected_pages; @@ -99,7 +99,7 @@ static int pcol_try_alloc(struct page_collect *pcol) BIO_MAX_PAGES_KMALLOC); if (!pcol->ios) { /* First time allocate io_state */ - int ret = exofs_get_io_state(pcol->sbi, &pcol->ios); + int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); if (ret) return ret; @@ -872,7 +872,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, int ret; *obj_size = ~0; - ret = exofs_get_io_state(sbi, &ios); + ret = exofs_get_io_state(&sbi->layout, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); return ret; @@ -1043,7 +1043,7 @@ static void create_done(struct exofs_io_state *ios, void *p) if (unlikely(ret)) { EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx", - _LLU(exofs_oi_objno(oi)), _LLU(sbi->s_pid)); + _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); /*TODO: When FS is corrupted creation can fail, object already * exist. Get rid of this asynchronous creation, if exist * increment the obj counter and try the next object. Until we @@ -1104,7 +1104,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) mark_inode_dirty(inode); - ret = exofs_get_io_state(sbi, &ios); + ret = exofs_get_io_state(&sbi->layout, &ios); if (unlikely(ret)) { EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); return ERR_PTR(ret); @@ -1202,7 +1202,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync) } else memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); - ret = exofs_get_io_state(sbi, &ios); + ret = exofs_get_io_state(&sbi->layout, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); goto free_args; @@ -1286,7 +1286,7 @@ void exofs_delete_inode(struct inode *inode) clear_inode(inode); - ret = exofs_get_io_state(sbi, &ios); + ret = exofs_get_io_state(&sbi->layout, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); return; diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 439c5d097b27..83e54a77b992 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -67,23 +67,24 @@ out: return ret; } -int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios) +int exofs_get_io_state(struct exofs_layout *layout, + struct exofs_io_state **pios) { struct exofs_io_state *ios; /*TODO: Maybe use kmem_cach per sbi of size - * exofs_io_state_size(sbi->s_numdevs) + * exofs_io_state_size(layout->s_numdevs) */ - ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL); + ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL); if (unlikely(!ios)) { EXOFS_DBGMSG("Faild kzalloc bytes=%d\n", - exofs_io_state_size(sbi->s_numdevs)); + exofs_io_state_size(layout->s_numdevs)); *pios = NULL; return -ENOMEM; } - ios->sbi = sbi; - ios->obj.partition = sbi->s_pid; + ios->layout = layout; + ios->obj.partition = layout->s_pid; *pios = ios; return 0; } @@ -238,10 +239,10 @@ int exofs_sbi_create(struct exofs_io_state *ios) { int i, ret; - for (i = 0; i < ios->sbi->s_numdevs; i++) { + for (i = 0; i < ios->layout->s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); + or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -262,10 +263,10 @@ int exofs_sbi_remove(struct exofs_io_state *ios) { int i, ret; - for (i = 0; i < ios->sbi->s_numdevs; i++) { + for (i = 0; i < ios->layout->s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); + or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -286,10 +287,10 @@ int exofs_sbi_write(struct exofs_io_state *ios) { int i, ret; - for (i = 0; i < ios->sbi->s_numdevs; i++) { + for (i = 0; i < ios->layout->s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); + or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -361,8 +362,9 @@ int exofs_sbi_read(struct exofs_io_state *ios) struct osd_request *or; unsigned first_dev = (unsigned)ios->obj.id; - first_dev %= ios->sbi->s_numdevs; - or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL); + first_dev %= ios->layout->s_numdevs; + or = osd_start_request(ios->layout->s_ods[first_dev], + GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -438,7 +440,7 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) __be64 newsize; int i, ret; - if (exofs_get_io_state(sbi, &ios)) + if (exofs_get_io_state(&sbi->layout, &ios)) return -ENOMEM; ios->obj.id = exofs_oi_objno(oi); @@ -448,10 +450,10 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) attr = g_attr_logical_length; attr.val_ptr = &newsize; - for (i = 0; i < sbi->s_numdevs; i++) { + for (i = 0; i < sbi->layout.s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(sbi->s_ods[i], GFP_KERNEL); + or = osd_start_request(sbi->layout.s_ods[i], GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index a1d1e77b12eb..fc8875186ae8 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -210,7 +210,7 @@ int exofs_sync_fs(struct super_block *sb, int wait) sbi = sb->s_fs_info; fscb = &sbi->s_fscb; - ret = exofs_get_io_state(sbi, &ios); + ret = exofs_get_io_state(&sbi->layout, &ios); if (ret) goto out; @@ -264,12 +264,12 @@ static void _exofs_print_device(const char *msg, const char *dev_path, void exofs_free_sbi(struct exofs_sb_info *sbi) { - while (sbi->s_numdevs) { - int i = --sbi->s_numdevs; - struct osd_dev *od = sbi->s_ods[i]; + while (sbi->layout.s_numdevs) { + int i = --sbi->layout.s_numdevs; + struct osd_dev *od = sbi->layout.s_ods[i]; if (od) { - sbi->s_ods[i] = NULL; + sbi->layout.s_ods[i] = NULL; osduld_put_device(od); } } @@ -298,7 +298,8 @@ static void exofs_put_super(struct super_block *sb) msecs_to_jiffies(100)); } - _exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid); + _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], + sbi->layout.s_pid); exofs_free_sbi(sbi); sb->s_fs_info = NULL; @@ -361,7 +362,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, { struct exofs_sb_info *sbi = *psbi; struct osd_dev *fscb_od; - struct osd_obj_id obj = {.partition = sbi->s_pid, + struct osd_obj_id obj = {.partition = sbi->layout.s_pid, .id = EXOFS_DEVTABLE_ID}; struct exofs_device_table *dt; unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + @@ -376,9 +377,9 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, return -ENOMEM; } - fscb_od = sbi->s_ods[0]; - sbi->s_ods[0] = NULL; - sbi->s_numdevs = 0; + fscb_od = sbi->layout.s_ods[0]; + sbi->layout.s_ods[0] = NULL; + sbi->layout.s_numdevs = 0; ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes); if (unlikely(ret)) { EXOFS_ERR("ERROR: reading device table\n"); @@ -397,14 +398,15 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, goto out; if (likely(numdevs > 1)) { - unsigned size = numdevs * sizeof(sbi->s_ods[0]); + unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]); sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL); if (unlikely(!sbi)) { ret = -ENOMEM; goto out; } - memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0])); + memset(&sbi->layout.s_ods[1], 0, + size - sizeof(sbi->layout.s_ods[0])); *psbi = sbi; } @@ -427,8 +429,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, * line. We always keep them in device-table order. */ if (fscb_od && osduld_device_same(fscb_od, &odi)) { - sbi->s_ods[i] = fscb_od; - ++sbi->s_numdevs; + sbi->layout.s_ods[i] = fscb_od; + ++sbi->layout.s_numdevs; fscb_od = NULL; continue; } @@ -441,8 +443,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, goto out; } - sbi->s_ods[i] = od; - ++sbi->s_numdevs; + sbi->layout.s_ods[i] = od; + ++sbi->layout.s_numdevs; /* Read the fscb of the other devices to make sure the FS * partition is there. @@ -499,9 +501,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - sbi->s_ods[0] = od; - sbi->s_numdevs = 1; - sbi->s_pid = opts->pid; + /* Default layout in case we do not have a device-table */ + sbi->layout.s_ods[0] = od; + sbi->layout.s_numdevs = 1; + sbi->layout.s_pid = opts->pid; sbi->s_timeout = opts->timeout; /* fill in some other data by hand */ @@ -514,7 +517,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sb->s_bdev = NULL; sb->s_dev = 0; - obj.partition = sbi->s_pid; + obj.partition = sbi->layout.s_pid; obj.id = EXOFS_SUPER_ID; exofs_make_credential(sbi->s_cred, &obj); @@ -578,13 +581,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - _exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0], - sbi->s_pid); + _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0], + sbi->layout.s_pid); return 0; free_sbi: EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", - opts->dev_name, sbi->s_pid, ret); + opts->dev_name, sbi->layout.s_pid, ret); exofs_free_sbi(sbi); return ret; } @@ -627,7 +630,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) uint8_t cred_a[OSD_CAP_LEN]; int ret; - ret = exofs_get_io_state(sbi, &ios); + ret = exofs_get_io_state(&sbi->layout, &ios); if (ret) { EXOFS_DBGMSG("exofs_get_io_state failed.\n"); return ret; -- cgit v1.2.3 From 46f4d973f6874c06b7a41a3bf8f4c1717d90f97a Mon Sep 17 00:00:00 2001 From: Boaz Harrosh <bharrosh@panasas.com> Date: Mon, 1 Feb 2010 11:37:30 +0200 Subject: exofs: unindent exofs_sbi_read The original idea was that a mirror read can be sub-divided to multiple devices. But this has very little gain and only at very large IOes so it's not going to be implemented soon. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> --- fs/exofs/ios.c | 87 +++++++++++++++++++++++++--------------------------------- 1 file changed, 38 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 83e54a77b992..4f679317ca54 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -356,59 +356,48 @@ out: int exofs_sbi_read(struct exofs_io_state *ios) { - int i, ret; - - for (i = 0; i < 1; i++) { - struct osd_request *or; - unsigned first_dev = (unsigned)ios->obj.id; - - first_dev %= ios->layout->s_numdevs; - or = osd_start_request(ios->layout->s_ods[first_dev], - GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); - ret = -ENOMEM; - goto out; - } - ios->per_dev[i].or = or; - ios->numdevs++; + struct osd_request *or; + struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; + unsigned first_dev = (unsigned)ios->obj.id; - if (ios->bio) { - osd_req_read(or, &ios->obj, ios->offset, ios->bio, - ios->length); - EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" - " dev=%d\n", _LLU(ios->obj.id), - _LLU(ios->offset), - _LLU(ios->length), - first_dev); - } else if (ios->kern_buff) { - osd_req_read_kern(or, &ios->obj, ios->offset, - ios->kern_buff, ios->length); - EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " - "length=0x%llx dev=%d\n", - _LLU(ios->obj.id), - _LLU(ios->offset), - _LLU(ios->length), - first_dev); - } else { - osd_req_get_attributes(or, &ios->obj); - EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", - _LLU(ios->obj.id), ios->in_attr_len, - first_dev); - } + first_dev %= ios->layout->s_numdevs; + or = osd_start_request(ios->layout->s_ods[first_dev], GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + return -ENOMEM; + } + per_dev->or = or; + ios->numdevs++; + + if (ios->bio) { + osd_req_read(or, &ios->obj, ios->offset, ios->bio, ios->length); + EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" + " dev=%d\n", _LLU(ios->obj.id), + _LLU(ios->offset), _LLU(ios->length), + first_dev); + } else if (ios->kern_buff) { + int ret = osd_req_read_kern(or, &ios->obj, ios->offset, + ios->kern_buff, ios->length); + + EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d ret=>%d\n", + _LLU(ios->obj.id), _LLU(ios->offset), + _LLU(ios->length), first_dev, ret); + if (unlikely(ret)) + return ret; + } else { + osd_req_get_attributes(or, &ios->obj); + EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", + _LLU(ios->obj.id), ios->in_attr_len, first_dev); + } - if (ios->out_attr) - osd_req_add_set_attr_list(or, ios->out_attr, - ios->out_attr_len); + if (ios->out_attr) + osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); - if (ios->in_attr) - osd_req_add_get_attr_list(or, ios->in_attr, - ios->in_attr_len); - } - ret = exofs_io_execute(ios); + if (ios->in_attr) + osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); -out: - return ret; + return exofs_io_execute(ios); } int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) -- cgit v1.2.3 From d9c740d2253e75db8cef8f87a3125c450f3ebd82 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh <bharrosh@panasas.com> Date: Thu, 28 Jan 2010 11:58:08 +0200 Subject: exofs: Define on-disk per-inode optional layout attribute * Layouts describe the way a file is spread on multiple devices. The layout information is stored in the objects attribute introduced in this patch. * There can be multiple generating function for the layout. Currently defined: - No attribute present - use below moving-window on global device table, all devices. (This is the only one currently used in exofs) - an obj_id generated moving window - the obj_id is a randomizing factor in the otherwise global map layout. - An explicit layout stored, including a data_map and a device index list. - More might be defined in future ... * There are two attributes defined of the same structure: A-data-files-layout - This layout is used by data-files. If present at a directory, all files of that directory will be created with this layout. A-meta-data-layout - This layout is used by a directory and other meta-data information. Also inherited at creation of subdirectories. * At creation time inodes are created with the layout specified above. A usermode utility may change the creation layout on a give directory or file. Which in the case of directories, will also apply to newly created files/subdirectories, children of that directory. In the simple unaltered case of a newly created exofs, no layout attributes are present, and all layouts adhere to the layout specified at the device-table. * In case of a future file system loaded in an old exofs-driver. At iget(), the generating_function is inspected and if not supported will return an IO error to the application and the inode will not be loaded. So not to damage any data. Note: After this patch we do not yet support any type of layout only the RAID0 patch that enables striping at the super-block level will add support for RAID0 layouts above. This way we are past and future compatible and fully bisectable. * Access to the device table is done by an accessor since it will change according to above information. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> --- fs/exofs/common.h | 39 ++++++++++++++++++++++++++++++++++++++ fs/exofs/exofs.h | 6 ++++++ fs/exofs/inode.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++----- fs/exofs/ios.c | 23 ++++++++++++++++++----- 4 files changed, 114 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/exofs/common.h b/fs/exofs/common.h index b1b178e61718..f0d520312d8b 100644 --- a/fs/exofs/common.h +++ b/fs/exofs/common.h @@ -55,6 +55,8 @@ /* exofs Application specific page/attribute */ # define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) # define EXOFS_ATTR_INODE_DATA 1 +# define EXOFS_ATTR_INODE_FILE_LAYOUT 2 +# define EXOFS_ATTR_INODE_DIR_LAYOUT 3 /* * The maximum number of files we can have is limited by the size of the @@ -206,4 +208,41 @@ enum { (((name_len) + offsetof(struct exofs_dir_entry, name) + \ EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND) +/* + * The on-disk (optional) layout structure. + * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT + * attribute, attached to any inode, usually to a directory. + */ + +enum exofs_inode_layout_gen_functions { + LAYOUT_MOVING_WINDOW = 0, + LAYOUT_IMPLICT = 1, +}; + +struct exofs_on_disk_inode_layout { + __le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */ + __le16 pad; + union { + /* gen_func == LAYOUT_MOVING_WINDOW (default) */ + struct exofs_layout_sliding_window { + __le32 num_devices; /* first n devices in global-table*/ + } sliding_window __packed; + + /* gen_func == LAYOUT_IMPLICT */ + struct exofs_layout_implict_list { + struct exofs_dt_data_map data_map; + /* Variable array of size data_map.cb_num_comps. These + * are device indexes of the devices in the global table + */ + __le32 dev_indexes[]; + } implict __packed; + }; +} __packed; + +static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs) +{ + return sizeof(struct exofs_on_disk_inode_layout) + + max_devs * sizeof(__le32); +} + #endif /*ifndef __EXOFS_COM_H__*/ diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 33c68568b338..09e331935514 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -185,6 +185,12 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode) return container_of(inode, struct exofs_i_info, vfs_inode); } +/* + * Given a layout, object_number and stripe_index return the associated global + * dev_index + */ +unsigned exofs_layout_od_id(struct exofs_layout *layout, + osd_id obj_no, unsigned layout_index); /* * Maximum count of links to a file */ diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 03189a958b33..0163546ba05a 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -859,6 +859,15 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr) return error; } +static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF( + EXOFS_APAGE_FS_DATA, + EXOFS_ATTR_INODE_FILE_LAYOUT, + 0); +static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF( + EXOFS_APAGE_FS_DATA, + EXOFS_ATTR_INODE_DIR_LAYOUT, + 0); + /* * Read an inode from the OSD, and return it as is. We also return the size * attribute in the 'obj_size' argument. @@ -867,11 +876,16 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, struct exofs_fcb *inode, uint64_t *obj_size) { struct exofs_sb_info *sbi = sb->s_fs_info; - struct osd_attr attrs[2]; + struct osd_attr attrs[] = { + [0] = g_attr_inode_data, + [1] = g_attr_inode_file_layout, + [2] = g_attr_inode_dir_layout, + [3] = g_attr_logical_length, + }; struct exofs_io_state *ios; + struct exofs_on_disk_inode_layout *layout; int ret; - *obj_size = ~0; ret = exofs_get_io_state(&sbi->layout, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); @@ -882,8 +896,9 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, exofs_make_credential(oi->i_cred, &ios->obj); ios->cred = oi->i_cred; - attrs[0] = g_attr_inode_data; - attrs[1] = g_attr_logical_length; + attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); + attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); + ios->in_attr = attrs; ios->in_attr_len = ARRAY_SIZE(attrs); @@ -900,12 +915,43 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE); ret = extract_attr_from_ios(ios, &attrs[1]); + if (ret) { + EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + goto out; + } + if (attrs[1].len) { + layout = attrs[1].val_ptr; + if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { + EXOFS_ERR("%s: unsupported files layout %d\n", + __func__, layout->gen_func); + ret = -ENOTSUPP; + goto out; + } + } + + ret = extract_attr_from_ios(ios, &attrs[2]); + if (ret) { + EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + goto out; + } + if (attrs[2].len) { + layout = attrs[2].val_ptr; + if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { + EXOFS_ERR("%s: unsupported meta-data layout %d\n", + __func__, layout->gen_func); + ret = -ENOTSUPP; + goto out; + } + } + + *obj_size = ~0; + ret = extract_attr_from_ios(ios, &attrs[3]); if (ret) { EXOFS_ERR("%s: extract_attr of logical_length failed\n", __func__); goto out; } - *obj_size = get_unaligned_be64(attrs[1].val_ptr); + *obj_size = get_unaligned_be64(attrs[3].val_ptr); out: exofs_put_io_state(ios); diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 4f679317ca54..2b81f99fd62c 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -107,6 +107,19 @@ void exofs_put_io_state(struct exofs_io_state *ios) } } +unsigned exofs_layout_od_id(struct exofs_layout *layout, + osd_id obj_no, unsigned layout_index) +{ + return layout_index; +} + +static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios, + unsigned layout_index) +{ + return ios->layout->s_ods[ + exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)]; +} + static void _sync_done(struct exofs_io_state *ios, void *p) { struct completion *waiting = p; @@ -242,7 +255,7 @@ int exofs_sbi_create(struct exofs_io_state *ios) for (i = 0; i < ios->layout->s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL); + or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -266,7 +279,7 @@ int exofs_sbi_remove(struct exofs_io_state *ios) for (i = 0; i < ios->layout->s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL); + or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -290,7 +303,7 @@ int exofs_sbi_write(struct exofs_io_state *ios) for (i = 0; i < ios->layout->s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL); + or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -361,7 +374,7 @@ int exofs_sbi_read(struct exofs_io_state *ios) unsigned first_dev = (unsigned)ios->obj.id; first_dev %= ios->layout->s_numdevs; - or = osd_start_request(ios->layout->s_ods[first_dev], GFP_KERNEL); + or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); return -ENOMEM; @@ -442,7 +455,7 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) for (i = 0; i < sbi->layout.s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(sbi->layout.s_ods[i], GFP_KERNEL); + or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; -- cgit v1.2.3 From 5d952b8391692553c31e620a92d6e09262a9a307 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh <bharrosh@panasas.com> Date: Mon, 1 Feb 2010 13:35:51 +0200 Subject: exofs: RAID0 support We now support striping over mirror devices. Including variable sized stripe_unit. Some limits: * stripe_unit must be a multiple of PAGE_SIZE * stripe_unit * stripe_count is maximum upto 32-bit (4Gb) Tested RAID0 over mirrors, RAID0 only, mirrors only. All check. Design notes: * I'm not using a vectored raid-engine mechanism yet. Following the pnfs-objects-layout data-map structure, "Mirror" is just a private case of "group_width" == 1, and RAID0 is a private case of "Mirrors" == 1. The performance lose of the general case over the particular special case optimization is totally negligible, also considering the extra code size. * In general I added a prepare_stripes() stage that divides the to-be-io pages to the participating devices, the previous exofs_ios_write/read, now becomes _write/read_mirrors and a new write/read upper layer loops on all devices calling _write/read_mirrors. Effectively the prepare_stripes stage is the all secret. Also truncate need fixing to accommodate for striping. * In a RAID0 arrangement, in a regular usage scenario, if all inode layouts will start at the same device, the small files fill up the first device and the later devices stay empty, the farther the device the emptier it is. To fix that, each inode will start at a different stripe_unit, according to it's obj_id modulus number-of-stripe-units. And will then span all stripe-units in the same incrementing order wrapping back to the beginning of the device table. We call it a stripe-units moving window. Special consideration was taken to keep all devices in a mirror arrangement identical. So a broken osd-device could just be cloned from one of the mirrors and no FS scrubbing is needed. (We do that by rotating stripe-unit at a time and not a single device at a time.) TODO: We no longer verify object_length == inode->i_size in exofs_iget. (since i_size is stripped on multiple objects now). I should introduce a multiple-device attribute reading, and use it in exofs_iget. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> --- fs/exofs/exofs.h | 11 ++ fs/exofs/inode.c | 26 +---- fs/exofs/ios.c | 327 ++++++++++++++++++++++++++++++++++++++++++++++--------- fs/exofs/super.c | 52 +++++++-- 4 files changed, 333 insertions(+), 83 deletions(-) (limited to 'fs') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 09e331935514..0d8a34b21ae1 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -58,6 +58,14 @@ struct exofs_layout { osd_id s_pid; /* partition ID of file system*/ + /* Our way of looking at the data_map */ + unsigned stripe_unit; + unsigned mirrors_p1; + + unsigned group_width; + + enum exofs_inode_layout_gen_functions lay_func; + unsigned s_numdevs; /* Num of devices in array */ struct osd_dev *s_ods[0]; /* Variable length */ }; @@ -133,6 +141,9 @@ struct exofs_io_state { struct exofs_per_dev_state { struct osd_request *or; struct bio *bio; + loff_t offset; + unsigned length; + unsigned dev; } per_dev[]; }; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 0163546ba05a..2b3163ea56eb 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -869,18 +869,17 @@ static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF( 0); /* - * Read an inode from the OSD, and return it as is. We also return the size - * attribute in the 'obj_size' argument. + * Read the Linux inode info from the OSD, and return it as is. In exofs the + * inode info is in an application specific page/attribute of the osd-object. */ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, - struct exofs_fcb *inode, uint64_t *obj_size) + struct exofs_fcb *inode) { struct exofs_sb_info *sbi = sb->s_fs_info; struct osd_attr attrs[] = { [0] = g_attr_inode_data, [1] = g_attr_inode_file_layout, [2] = g_attr_inode_dir_layout, - [3] = g_attr_logical_length, }; struct exofs_io_state *ios; struct exofs_on_disk_inode_layout *layout; @@ -944,15 +943,6 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, } } - *obj_size = ~0; - ret = extract_attr_from_ios(ios, &attrs[3]); - if (ret) { - EXOFS_ERR("%s: extract_attr of logical_length failed\n", - __func__); - goto out; - } - *obj_size = get_unaligned_be64(attrs[3].val_ptr); - out: exofs_put_io_state(ios); return ret; @@ -971,7 +961,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) struct exofs_i_info *oi; struct exofs_fcb fcb; struct inode *inode; - uint64_t obj_size; int ret; inode = iget_locked(sb, ino); @@ -983,7 +972,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) __oi_init(oi); /* read the inode from the osd */ - ret = exofs_get_inode(sb, oi, &fcb, &obj_size); + ret = exofs_get_inode(sb, oi, &fcb); if (ret) goto bad_inode; @@ -1004,13 +993,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) inode->i_blkbits = EXOFS_BLKSHIFT; inode->i_generation = le32_to_cpu(fcb.i_generation); - if ((inode->i_size != obj_size) && - (!exofs_inode_is_fast_symlink(inode))) { - EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n", - inode->i_size, _LLU(obj_size)); - /* FIXME: call exofs_inode_recovery() */ - } - oi->i_dir_start_lookup = 0; if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 2b81f99fd62c..6e446b2670b9 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -23,6 +23,7 @@ */ #include <scsi/scsi_device.h> +#include <asm/div64.h> #include "exofs.h" @@ -110,7 +111,17 @@ void exofs_put_io_state(struct exofs_io_state *ios) unsigned exofs_layout_od_id(struct exofs_layout *layout, osd_id obj_no, unsigned layout_index) { - return layout_index; +/* switch (layout->lay_func) { + case LAYOUT_MOVING_WINDOW: + {*/ + unsigned dev_mod = obj_no; + + return (layout_index + dev_mod * layout->mirrors_p1) % + layout->s_numdevs; +/* } + case LAYOUT_FUNC_IMPLICT: + return layout->devs[layout_index]; + }*/ } static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios, @@ -225,8 +236,8 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) _clear_bio(ios->per_dev[i].bio); EXOFS_DBGMSG("start read offset passed end of file " "offset=0x%llx, length=0x%llx\n", - _LLU(ios->offset), - _LLU(ios->length)); + _LLU(ios->per_dev[i].offset), + _LLU(ios->per_dev[i].length)); continue; /* we recovered */ } @@ -248,6 +259,127 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) return acumulated_lin_err; } +/* REMOVEME: After review + Some quoteing from the standard + + L = logical offset into the file + W = number of data components in a stripe + S = W * stripe_unit (S is Stripe length) + N = L / S (N is the stripe Number) + C = (L-(N*S)) / stripe_unit (C is the component) + O = (N*stripe_unit)+(L%stripe_unit) (O is the object's offset) +*/ + +static void _offset_dev_unit_off(struct exofs_io_state *ios, u64 file_offset, + u64 *obj_offset, unsigned *dev, unsigned *unit_off) +{ + unsigned stripe_unit = ios->layout->stripe_unit; + unsigned stripe_length = stripe_unit * ios->layout->group_width; + u64 stripe_no = file_offset; + unsigned stripe_mod = do_div(stripe_no, stripe_length); + + *unit_off = stripe_mod % stripe_unit; + *obj_offset = stripe_no * stripe_unit + *unit_off; + *dev = stripe_mod / stripe_unit * ios->layout->mirrors_p1; +} + +static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_bvec, + struct exofs_per_dev_state *per_dev, int cur_len) +{ + unsigned bv = *cur_bvec; + struct request_queue *q = + osd_request_queue(exofs_ios_od(ios, per_dev->dev)); + + per_dev->length += cur_len; + + if (per_dev->bio == NULL) { + unsigned pages_in_stripe = ios->layout->group_width * + (ios->layout->stripe_unit / PAGE_SIZE); + unsigned bio_size = (ios->bio->bi_vcnt + pages_in_stripe) / + ios->layout->group_width; + + per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); + if (unlikely(!per_dev->bio)) { + EXOFS_DBGMSG("Faild to allocate BIO size=%u\n", + bio_size); + return -ENOMEM; + } + } + + while (cur_len > 0) { + int added_len; + struct bio_vec *bvec = &ios->bio->bi_io_vec[bv]; + + BUG_ON(ios->bio->bi_vcnt <= bv); + cur_len -= bvec->bv_len; + + added_len = bio_add_pc_page(q, per_dev->bio, bvec->bv_page, + bvec->bv_len, bvec->bv_offset); + if (unlikely(bvec->bv_len != added_len)) + return -ENOMEM; + ++bv; + } + BUG_ON(cur_len); + + *cur_bvec = bv; + return 0; +} + +static int _prepare_for_striping(struct exofs_io_state *ios) +{ + u64 length = ios->length; + u64 offset = ios->offset; + unsigned stripe_unit = ios->layout->stripe_unit; + unsigned comp = 0; + unsigned stripes = 0; + unsigned cur_bvec = 0; + int ret; + + if (!ios->bio) { + if (ios->kern_buff) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; + unsigned unit_off; + + _offset_dev_unit_off(ios, offset, &per_dev->offset, + &per_dev->dev, &unit_off); + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (unit_off + length > stripe_unit)); + } + ios->numdevs = ios->layout->mirrors_p1; + return 0; + } + + while (length) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[comp]; + unsigned cur_len; + + if (!per_dev->length) { + unsigned unit_off; + + _offset_dev_unit_off(ios, offset, &per_dev->offset, + &per_dev->dev, &unit_off); + stripes++; + cur_len = min_t(u64, stripe_unit - unit_off, length); + offset += cur_len; + } else { + cur_len = min_t(u64, stripe_unit, length); + } + + ret = _add_stripe_unit(ios, &cur_bvec, per_dev, cur_len); + if (unlikely(ret)) + goto out; + + comp += ios->layout->mirrors_p1; + comp %= ios->layout->s_numdevs; + + length -= cur_len; + } +out: + ios->numdevs = stripes * ios->layout->mirrors_p1; + return ret; +} + int exofs_sbi_create(struct exofs_io_state *ios) { int i, ret; @@ -296,61 +428,71 @@ out: return ret; } -int exofs_sbi_write(struct exofs_io_state *ios) +static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) { - int i, ret; + struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp]; + unsigned dev = ios->per_dev[cur_comp].dev; + unsigned last_comp = cur_comp + ios->layout->mirrors_p1; + int ret = 0; - for (i = 0; i < ios->layout->s_numdevs; i++) { + for (; cur_comp < last_comp; ++cur_comp, ++dev) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; struct osd_request *or; - or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); + or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; goto out; } - ios->per_dev[i].or = or; - ios->numdevs++; + per_dev->or = or; + per_dev->offset = master_dev->offset; if (ios->bio) { struct bio *bio; - if (i != 0) { + if (per_dev != master_dev) { bio = bio_kmalloc(GFP_KERNEL, - ios->bio->bi_max_vecs); + master_dev->bio->bi_max_vecs); if (unlikely(!bio)) { EXOFS_DBGMSG( "Faild to allocate BIO size=%u\n", - ios->bio->bi_max_vecs); + master_dev->bio->bi_max_vecs); ret = -ENOMEM; goto out; } - __bio_clone(bio, ios->bio); + __bio_clone(bio, master_dev->bio); bio->bi_bdev = NULL; bio->bi_next = NULL; - ios->per_dev[i].bio = bio; + per_dev->length = master_dev->length; + per_dev->bio = bio; + per_dev->dev = dev; } else { - bio = ios->bio; + bio = master_dev->bio; + /* FIXME: bio_set_dir() */ + bio->bi_rw |= (1 << BIO_RW); } - osd_req_write(or, &ios->obj, ios->offset, bio, - ios->length); + osd_req_write(or, &ios->obj, per_dev->offset, bio, + per_dev->length); EXOFS_DBGMSG("write(0x%llx) offset=0x%llx " "length=0x%llx dev=%d\n", - _LLU(ios->obj.id), _LLU(ios->offset), - _LLU(ios->length), i); + _LLU(ios->obj.id), _LLU(per_dev->offset), + _LLU(per_dev->length), dev); } else if (ios->kern_buff) { - osd_req_write_kern(or, &ios->obj, ios->offset, + ret = osd_req_write_kern(or, &ios->obj, per_dev->offset, ios->kern_buff, ios->length); + if (unlikely(ret)) + goto out; EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx " "length=0x%llx dev=%d\n", - _LLU(ios->obj.id), _LLU(ios->offset), - _LLU(ios->length), i); + _LLU(ios->obj.id), _LLU(per_dev->offset), + _LLU(ios->length), dev); } else { osd_req_set_attributes(or, &ios->obj); EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", - _LLU(ios->obj.id), ios->out_attr_len, i); + _LLU(ios->obj.id), ios->out_attr_len, dev); } if (ios->out_attr) @@ -361,40 +503,57 @@ int exofs_sbi_write(struct exofs_io_state *ios) osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); } - ret = exofs_io_execute(ios); out: return ret; } -int exofs_sbi_read(struct exofs_io_state *ios) +int exofs_sbi_write(struct exofs_io_state *ios) +{ + int i; + int ret; + + ret = _prepare_for_striping(ios); + if (unlikely(ret)) + return ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + ret = _sbi_write_mirror(ios, i); + if (unlikely(ret)) + return ret; + } + + ret = exofs_io_execute(ios); + return ret; +} + +static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) { struct osd_request *or; - struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; + struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; unsigned first_dev = (unsigned)ios->obj.id; - first_dev %= ios->layout->s_numdevs; + first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); return -ENOMEM; } per_dev->or = or; - ios->numdevs++; if (ios->bio) { - osd_req_read(or, &ios->obj, ios->offset, ios->bio, ios->length); + osd_req_read(or, &ios->obj, per_dev->offset, + per_dev->bio, per_dev->length); EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" " dev=%d\n", _LLU(ios->obj.id), - _LLU(ios->offset), _LLU(ios->length), + _LLU(per_dev->offset), _LLU(per_dev->length), first_dev); } else if (ios->kern_buff) { - int ret = osd_req_read_kern(or, &ios->obj, ios->offset, + int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset, ios->kern_buff, ios->length); - EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " "length=0x%llx dev=%d ret=>%d\n", - _LLU(ios->obj.id), _LLU(ios->offset), + _LLU(ios->obj.id), _LLU(per_dev->offset), _LLU(ios->length), first_dev, ret); if (unlikely(ret)) return ret; @@ -403,14 +562,32 @@ int exofs_sbi_read(struct exofs_io_state *ios) EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", _LLU(ios->obj.id), ios->in_attr_len, first_dev); } - if (ios->out_attr) osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); if (ios->in_attr) osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); - return exofs_io_execute(ios); + return 0; +} + +int exofs_sbi_read(struct exofs_io_state *ios) +{ + int i; + int ret; + + ret = _prepare_for_striping(ios); + if (unlikely(ret)) + return ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + ret = _sbi_read_mirror(ios, i); + if (unlikely(ret)) + return ret; + } + + ret = exofs_io_execute(ios); + return ret; } int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) @@ -434,42 +611,84 @@ int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) return -EIO; } +static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp, + struct osd_attr *attr) +{ + int last_comp = cur_comp + ios->layout->mirrors_p1; + + for (; cur_comp < last_comp; ++cur_comp) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct osd_request *or; + + or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + return -ENOMEM; + } + per_dev->or = or; + + osd_req_set_attributes(or, &ios->obj); + osd_req_add_set_attr_list(or, attr, 1); + } + + return 0; +} + int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) { struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; struct exofs_io_state *ios; - struct osd_attr attr; - __be64 newsize; + struct exofs_trunc_attr { + struct osd_attr attr; + __be64 newsize; + } *size_attrs; + u64 this_obj_size; + unsigned dev; + unsigned unit_off; int i, ret; - if (exofs_get_io_state(&sbi->layout, &ios)) - return -ENOMEM; + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) + return ret; + + size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs), + GFP_KERNEL); + if (unlikely(!size_attrs)) { + ret = -ENOMEM; + goto out; + } ios->obj.id = exofs_oi_objno(oi); ios->cred = oi->i_cred; - newsize = cpu_to_be64(size); - attr = g_attr_logical_length; - attr.val_ptr = &newsize; + ios->numdevs = ios->layout->s_numdevs; + _offset_dev_unit_off(ios, size, &this_obj_size, &dev, &unit_off); - for (i = 0; i < sbi->layout.s_numdevs; i++) { - struct osd_request *or; + for (i = 0; i < ios->layout->group_width; ++i) { + struct exofs_trunc_attr *size_attr = &size_attrs[i]; + u64 obj_size; - or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); - ret = -ENOMEM; - goto out; - } - ios->per_dev[i].or = or; - ios->numdevs++; + if (i < dev) + obj_size = this_obj_size + + ios->layout->stripe_unit - unit_off; + else if (i == dev) + obj_size = this_obj_size; + else /* i > dev */ + obj_size = this_obj_size - unit_off; - osd_req_set_attributes(or, &ios->obj); - osd_req_add_set_attr_list(or, &attr, 1); + size_attr->newsize = cpu_to_be64(obj_size); + size_attr->attr = g_attr_logical_length; + size_attr->attr.val_ptr = &size_attr->newsize; + + ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, + &size_attr->attr); + if (unlikely(ret)) + goto out; } ret = exofs_io_execute(ios); out: + kfree(size_attrs); exofs_put_io_state(ios); return ret; } diff --git a/fs/exofs/super.c b/fs/exofs/super.c index fc8875186ae8..8f4e4b37a578 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -308,6 +308,8 @@ static void exofs_put_super(struct super_block *sb) static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, struct exofs_device_table *dt) { + u64 stripe_length; + sbi->data_map.odm_num_comps = le32_to_cpu(dt->dt_data_map.cb_num_comps); sbi->data_map.odm_stripe_unit = @@ -321,14 +323,47 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, sbi->data_map.odm_raid_algorithm = le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); -/* FIXME: Hard coded mirror only for now. if not so do not mount */ - if ((sbi->data_map.odm_num_comps != numdevs) || - (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) || - (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) || - (sbi->data_map.odm_mirror_cnt != (numdevs - 1))) +/* FIXME: Only raid0 !group_width/depth for now. if not so, do not mount */ + if (sbi->data_map.odm_group_width || sbi->data_map.odm_group_depth) { + EXOFS_ERR("Group width/depth not supported\n"); return -EINVAL; - else - return 0; + } + if (sbi->data_map.odm_num_comps != numdevs) { + EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", + sbi->data_map.odm_num_comps, numdevs); + return -EINVAL; + } + if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) { + EXOFS_ERR("Only RAID_0 for now\n"); + return -EINVAL; + } + if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) { + EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n", + numdevs, sbi->data_map.odm_mirror_cnt); + return -EINVAL; + } + + stripe_length = sbi->data_map.odm_stripe_unit * + (numdevs / (sbi->data_map.odm_mirror_cnt + 1)); + if (stripe_length >= (1ULL << 32)) { + EXOFS_ERR("Total Stripe length(0x%llx)" + " >= 32bit is not supported\n", _LLU(stripe_length)); + return -EINVAL; + } + + if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { + EXOFS_ERR("Stripe Unit(0x%llx)" + " must be Multples of PAGE_SIZE(0x%lx)\n", + _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE); + return -EINVAL; + } + + sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; + sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; + sbi->layout.group_width = sbi->data_map.odm_num_comps / + sbi->layout.mirrors_p1; + + return 0; } /* @odi is valid only as long as @fscb_dev is valid */ @@ -502,6 +537,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) } /* Default layout in case we do not have a device-table */ + sbi->layout.stripe_unit = PAGE_SIZE; + sbi->layout.mirrors_p1 = 1; + sbi->layout.group_width = 1; sbi->layout.s_ods[0] = od; sbi->layout.s_numdevs = 1; sbi->layout.s_pid = opts->pid; -- cgit v1.2.3 From 86093aaff5be5b214613eb60553e236bdb389c84 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh <bharrosh@panasas.com> Date: Thu, 28 Jan 2010 18:24:06 +0200 Subject: exofs: convert io_state to use pages array instead of bio at input * inode.c operations are full-pages based, and not actually true scatter-gather * Lets us use more pages at once upto 512 (from 249) in 64 bit * Brings us much much closer to be able to use exofs's io_state engine from objlayout driver. (Once I decide where to put the common code) After RAID0 patch the outer (input) bio was never used as a bio, but was simply a page carrier into the raid engine. Even in the simple mirror/single-dev arrangement pages info was copied into a second bio. It is now easer to just pass a pages array into the io_state and prepare bio(s) once. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> --- fs/exofs/exofs.h | 5 +++- fs/exofs/inode.c | 81 ++++++++++++++++++++++++++++---------------------------- fs/exofs/ios.c | 46 ++++++++++++++++++-------------- 3 files changed, 71 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 0d8a34b21ae1..acfebd36de83 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -128,7 +128,10 @@ struct exofs_io_state { loff_t offset; unsigned long length; void *kern_buff; - struct bio *bio; + + struct page **pages; + unsigned nr_pages; + unsigned pgbase; /* Attributes */ unsigned in_attr_len; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 2b3163ea56eb..6ca0b0117f04 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -41,16 +41,18 @@ enum { BIO_MAX_PAGES_KMALLOC = (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), + MAX_PAGES_KMALLOC = + PAGE_SIZE / sizeof(struct page *), }; struct page_collect { struct exofs_sb_info *sbi; - struct request_queue *req_q; struct inode *inode; unsigned expected_pages; struct exofs_io_state *ios; - struct bio *bio; + struct page **pages; + unsigned alloc_pages; unsigned nr_pages; unsigned long length; loff_t pg_first; /* keep 64bit also in 32-arches */ @@ -62,15 +64,12 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; pcol->sbi = sbi; - /* Create master bios on first Q, later on cloning, each clone will be - * allocated on it's destination Q - */ - pcol->req_q = osd_request_queue(sbi->layout.s_ods[0]); pcol->inode = inode; pcol->expected_pages = expected_pages; pcol->ios = NULL; - pcol->bio = NULL; + pcol->pages = NULL; + pcol->alloc_pages = 0; pcol->nr_pages = 0; pcol->length = 0; pcol->pg_first = -1; @@ -80,7 +79,8 @@ static void _pcol_reset(struct page_collect *pcol) { pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); - pcol->bio = NULL; + pcol->pages = NULL; + pcol->alloc_pages = 0; pcol->nr_pages = 0; pcol->length = 0; pcol->pg_first = -1; @@ -90,13 +90,13 @@ static void _pcol_reset(struct page_collect *pcol) * it might not end here. don't be left with nothing */ if (!pcol->expected_pages) - pcol->expected_pages = BIO_MAX_PAGES_KMALLOC; + pcol->expected_pages = MAX_PAGES_KMALLOC; } static int pcol_try_alloc(struct page_collect *pcol) { - int pages = min_t(unsigned, pcol->expected_pages, - BIO_MAX_PAGES_KMALLOC); + unsigned pages = min_t(unsigned, pcol->expected_pages, + MAX_PAGES_KMALLOC); if (!pcol->ios) { /* First time allocate io_state */ int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); @@ -105,23 +105,28 @@ static int pcol_try_alloc(struct page_collect *pcol) return ret; } + /* TODO: easily support bio chaining */ + pages = min_t(unsigned, pages, + pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC); + for (; pages; pages >>= 1) { - pcol->bio = bio_kmalloc(GFP_KERNEL, pages); - if (likely(pcol->bio)) + pcol->pages = kmalloc(pages * sizeof(struct page *), + GFP_KERNEL); + if (likely(pcol->pages)) { + pcol->alloc_pages = pages; return 0; + } } - EXOFS_ERR("Failed to bio_kmalloc expected_pages=%u\n", + EXOFS_ERR("Failed to kmalloc expected_pages=%u\n", pcol->expected_pages); return -ENOMEM; } static void pcol_free(struct page_collect *pcol) { - if (pcol->bio) { - bio_put(pcol->bio); - pcol->bio = NULL; - } + kfree(pcol->pages); + pcol->pages = NULL; if (pcol->ios) { exofs_put_io_state(pcol->ios); @@ -132,11 +137,10 @@ static void pcol_free(struct page_collect *pcol) static int pcol_add_page(struct page_collect *pcol, struct page *page, unsigned len) { - int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0); - if (unlikely(len != added_len)) + if (unlikely(pcol->nr_pages >= pcol->alloc_pages)) return -ENOMEM; - ++pcol->nr_pages; + pcol->pages[pcol->nr_pages++] = page; pcol->length += len; return 0; } @@ -181,7 +185,6 @@ static void update_write_page(struct page *page, int ret) */ static int __readpages_done(struct page_collect *pcol, bool do_unlock) { - struct bio_vec *bvec; int i; u64 resid; u64 good_bytes; @@ -198,8 +201,8 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock) pcol->inode->i_ino, _LLU(good_bytes), pcol->length, pcol->nr_pages); - __bio_for_each_segment(bvec, pcol->bio, i, 0) { - struct page *page = bvec->bv_page; + for (i = 0; i < pcol->nr_pages; i++) { + struct page *page = pcol->pages[i]; struct inode *inode = page->mapping->host; int page_stat; @@ -218,7 +221,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock) ret = update_read_page(page, page_stat); if (do_unlock) unlock_page(page); - length += bvec->bv_len; + length += PAGE_SIZE; } pcol_free(pcol); @@ -238,11 +241,10 @@ static void readpages_done(struct exofs_io_state *ios, void *p) static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) { - struct bio_vec *bvec; int i; - __bio_for_each_segment(bvec, pcol->bio, i, 0) { - struct page *page = bvec->bv_page; + for (i = 0; i < pcol->nr_pages; i++) { + struct page *page = pcol->pages[i]; if (rw == READ) update_read_page(page, ret); @@ -260,13 +262,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync) struct page_collect *pcol_copy = NULL; int ret; - if (!pcol->bio) + if (!pcol->pages) return 0; /* see comment in _readpage() about sync reads */ WARN_ON(is_sync && (pcol->nr_pages != 1)); - ios->bio = pcol->bio; + ios->pages = pcol->pages; + ios->nr_pages = pcol->nr_pages; ios->length = pcol->length; ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; @@ -366,7 +369,7 @@ try_again: goto try_again; } - if (!pcol->bio) { + if (!pcol->pages) { ret = pcol_try_alloc(pcol); if (unlikely(ret)) goto fail; @@ -448,7 +451,6 @@ static int exofs_readpage(struct file *file, struct page *page) static void writepages_done(struct exofs_io_state *ios, void *p) { struct page_collect *pcol = p; - struct bio_vec *bvec; int i; u64 resid; u64 good_bytes; @@ -467,8 +469,8 @@ static void writepages_done(struct exofs_io_state *ios, void *p) pcol->inode->i_ino, _LLU(good_bytes), pcol->length, pcol->nr_pages); - __bio_for_each_segment(bvec, pcol->bio, i, 0) { - struct page *page = bvec->bv_page; + for (i = 0; i < pcol->nr_pages; i++) { + struct page *page = pcol->pages[i]; struct inode *inode = page->mapping->host; int page_stat; @@ -485,7 +487,7 @@ static void writepages_done(struct exofs_io_state *ios, void *p) EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n", inode->i_ino, page->index, page_stat); - length += bvec->bv_len; + length += PAGE_SIZE; } pcol_free(pcol); @@ -500,7 +502,7 @@ static int write_exec(struct page_collect *pcol) struct page_collect *pcol_copy = NULL; int ret; - if (!pcol->bio) + if (!pcol->pages) return 0; pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); @@ -512,9 +514,8 @@ static int write_exec(struct page_collect *pcol) *pcol_copy = *pcol; - pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */ - - ios->bio = pcol_copy->bio; + ios->pages = pcol_copy->pages; + ios->nr_pages = pcol_copy->nr_pages; ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT; ios->length = pcol_copy->length; ios->done = writepages_done; @@ -605,7 +606,7 @@ try_again: goto try_again; } - if (!pcol->bio) { + if (!pcol->pages) { ret = pcol_try_alloc(pcol); if (unlikely(ret)) goto fail; diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 6e446b2670b9..263052c77f41 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -283,10 +283,11 @@ static void _offset_dev_unit_off(struct exofs_io_state *ios, u64 file_offset, *dev = stripe_mod / stripe_unit * ios->layout->mirrors_p1; } -static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_bvec, - struct exofs_per_dev_state *per_dev, int cur_len) +static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct exofs_per_dev_state *per_dev, + int cur_len) { - unsigned bv = *cur_bvec; + unsigned pg = *cur_pg; struct request_queue *q = osd_request_queue(exofs_ios_od(ios, per_dev->dev)); @@ -295,7 +296,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_bvec, if (per_dev->bio == NULL) { unsigned pages_in_stripe = ios->layout->group_width * (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->bio->bi_vcnt + pages_in_stripe) / + unsigned bio_size = (ios->nr_pages + pages_in_stripe) / ios->layout->group_width; per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); @@ -307,21 +308,22 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_bvec, } while (cur_len > 0) { - int added_len; - struct bio_vec *bvec = &ios->bio->bi_io_vec[bv]; + unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); + unsigned added_len; - BUG_ON(ios->bio->bi_vcnt <= bv); - cur_len -= bvec->bv_len; + BUG_ON(ios->nr_pages <= pg); + cur_len -= pglen; - added_len = bio_add_pc_page(q, per_dev->bio, bvec->bv_page, - bvec->bv_len, bvec->bv_offset); - if (unlikely(bvec->bv_len != added_len)) + added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], + pglen, pgbase); + if (unlikely(pglen != added_len)) return -ENOMEM; - ++bv; + pgbase = 0; + ++pg; } BUG_ON(cur_len); - *cur_bvec = bv; + *cur_pg = pg; return 0; } @@ -332,10 +334,10 @@ static int _prepare_for_striping(struct exofs_io_state *ios) unsigned stripe_unit = ios->layout->stripe_unit; unsigned comp = 0; unsigned stripes = 0; - unsigned cur_bvec = 0; - int ret; + unsigned cur_pg = 0; + int ret = 0; - if (!ios->bio) { + if (!ios->pages) { if (ios->kern_buff) { struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; unsigned unit_off; @@ -352,7 +354,7 @@ static int _prepare_for_striping(struct exofs_io_state *ios) while (length) { struct exofs_per_dev_state *per_dev = &ios->per_dev[comp]; - unsigned cur_len; + unsigned cur_len, page_off; if (!per_dev->length) { unsigned unit_off; @@ -362,11 +364,15 @@ static int _prepare_for_striping(struct exofs_io_state *ios) stripes++; cur_len = min_t(u64, stripe_unit - unit_off, length); offset += cur_len; + page_off = unit_off & ~PAGE_MASK; + BUG_ON(page_off != ios->pgbase); } else { cur_len = min_t(u64, stripe_unit, length); + page_off = 0; } - ret = _add_stripe_unit(ios, &cur_bvec, per_dev, cur_len); + ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, + cur_len); if (unlikely(ret)) goto out; @@ -448,7 +454,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) per_dev->or = or; per_dev->offset = master_dev->offset; - if (ios->bio) { + if (ios->pages) { struct bio *bio; if (per_dev != master_dev) { @@ -541,7 +547,7 @@ static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) } per_dev->or = or; - if (ios->bio) { + if (ios->pages) { osd_req_read(or, &ios->obj, per_dev->offset, per_dev->bio, per_dev->length); EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" -- cgit v1.2.3 From 96391e2bae0f8882b6f44809202a68be66e91dce Mon Sep 17 00:00:00 2001 From: Boaz Harrosh <bharrosh@panasas.com> Date: Tue, 9 Feb 2010 11:43:21 +0200 Subject: exofs: Error recovery if object is missing from storage If an object is referenced by a directory but does not exist on a target, it is a very serious corruption that means: 1. Either a power failure with very slim chance of it happening. Because the directory update is always submitted much after object creation, but if a directory is written to one device and the object creation to another it might theoretically happen. 2. It only ever happened to me while developing with BUGs causing file corruption. Crashes could also cause it but they are more like case 1. In any way the object does not exist, so data is surely lost. If there is a mix-up in the obj-id or data-map, then lost objects can be salvaged by off-line fsck. The only recoverable information is the directory name. By letting it appear as a regular empty file, with date==0 (1970 Jan 1st) ownership to root, we enable recovery of the only useful information. And also enable deletion or over-write. I can see how this can hurt. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> --- fs/exofs/inode.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 6ca0b0117f04..5514f3c2c2f4 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -903,8 +903,18 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, ios->in_attr_len = ARRAY_SIZE(attrs); ret = exofs_sbi_read(ios); - if (ret) + if (unlikely(ret)) { + EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n", + _LLU(ios->obj.id), ret); + memset(inode, 0, sizeof(*inode)); + inode->i_mode = 0040000 | (0777 & ~022); + /* If object is lost on target we might as well enable it's + * delete. + */ + if ((ret == -ENOENT) || (ret == -EINVAL)) + ret = 0; goto out; + } ret = extract_attr_from_ios(ios, &attrs[0]); if (ret) { -- cgit v1.2.3 From b367e78bd1c7af4c018ce98b1f6d3e001aba895a Mon Sep 17 00:00:00 2001 From: Boaz Harrosh <bharrosh@panasas.com> Date: Sun, 7 Feb 2010 19:18:58 +0200 Subject: exofs: Prepare for groups * Rename _offset_dev_unit_off() to _calc_stripe_info() and recieve a struct for the output params * In _prepare_for_striping we only need to call _calc_stripe_info() once. The other componets are easy to calculate from that. This code was inspired by what's done in truncate. * Some code shifts that make sense now but will make more sense when group support is added. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> --- fs/exofs/ios.c | 159 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 99 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 263052c77f41..d28febdf54ab 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -259,28 +259,46 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) return acumulated_lin_err; } -/* REMOVEME: After review - Some quoteing from the standard - - L = logical offset into the file - W = number of data components in a stripe - S = W * stripe_unit (S is Stripe length) - N = L / S (N is the stripe Number) - C = (L-(N*S)) / stripe_unit (C is the component) - O = (N*stripe_unit)+(L%stripe_unit) (O is the object's offset) -*/ - -static void _offset_dev_unit_off(struct exofs_io_state *ios, u64 file_offset, - u64 *obj_offset, unsigned *dev, unsigned *unit_off) +/* + * L - logical offset into the file + * + * U - The number of bytes in a full stripe + * + * U = stripe_unit * group_width + * + * N - The stripe number + * + * N = L / U + * + * C - The component index coresponding to L + * + * C = (L - (N*U)) / stripe_unit + * + * O - The component offset coresponding to L + * + * (N*stripe_unit)+(L%stripe_unit) + */ + +struct _striping_info { + u64 obj_offset; + unsigned dev; + unsigned unit_off; +}; + +static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, + struct _striping_info *si) { - unsigned stripe_unit = ios->layout->stripe_unit; - unsigned stripe_length = stripe_unit * ios->layout->group_width; - u64 stripe_no = file_offset; - unsigned stripe_mod = do_div(stripe_no, stripe_length); + u32 stripe_unit = ios->layout->stripe_unit; + u32 group_width = ios->layout->group_width; + u32 U = stripe_unit * group_width; + + u32 LmodU; + u64 N = div_u64_rem(file_offset, U, &LmodU); - *unit_off = stripe_mod % stripe_unit; - *obj_offset = stripe_no * stripe_unit + *unit_off; - *dev = stripe_mod / stripe_unit * ios->layout->mirrors_p1; + si->unit_off = LmodU % stripe_unit; + si->obj_offset = N * stripe_unit + si->unit_off; + si->dev = LmodU / stripe_unit; + si->dev *= ios->layout->mirrors_p1; } static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, @@ -327,65 +345,88 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, return 0; } -static int _prepare_for_striping(struct exofs_io_state *ios) +static int _prepare_pages(struct exofs_io_state *ios, + struct _striping_info *si) { u64 length = ios->length; - u64 offset = ios->offset; unsigned stripe_unit = ios->layout->stripe_unit; + unsigned mirrors_p1 = ios->layout->mirrors_p1; + unsigned dev = si->dev; unsigned comp = 0; unsigned stripes = 0; unsigned cur_pg = 0; int ret = 0; - if (!ios->pages) { - if (ios->kern_buff) { - struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; - unsigned unit_off; - - _offset_dev_unit_off(ios, offset, &per_dev->offset, - &per_dev->dev, &unit_off); - /* no cross device without page array */ - BUG_ON((ios->layout->group_width > 1) && - (unit_off + length > stripe_unit)); - } - ios->numdevs = ios->layout->mirrors_p1; - return 0; - } - while (length) { struct exofs_per_dev_state *per_dev = &ios->per_dev[comp]; - unsigned cur_len, page_off; + unsigned cur_len, page_off = 0; if (!per_dev->length) { - unsigned unit_off; + per_dev->dev = dev; + if (dev < si->dev) { + per_dev->offset = si->obj_offset + stripe_unit - + si->unit_off; + cur_len = stripe_unit; + } else if (dev == si->dev) { + per_dev->offset = si->obj_offset; + cur_len = stripe_unit - si->unit_off; + page_off = si->unit_off & ~PAGE_MASK; + BUG_ON(page_off && (page_off != ios->pgbase)); + } else { /* dev > si->dev */ + per_dev->offset = si->obj_offset - si->unit_off; + cur_len = stripe_unit; + } - _offset_dev_unit_off(ios, offset, &per_dev->offset, - &per_dev->dev, &unit_off); stripes++; - cur_len = min_t(u64, stripe_unit - unit_off, length); - offset += cur_len; - page_off = unit_off & ~PAGE_MASK; - BUG_ON(page_off != ios->pgbase); + + dev += mirrors_p1; + dev %= ios->layout->s_numdevs; } else { - cur_len = min_t(u64, stripe_unit, length); - page_off = 0; + cur_len = stripe_unit; } + if (cur_len >= length) + cur_len = length; ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, cur_len); if (unlikely(ret)) goto out; - comp += ios->layout->mirrors_p1; + comp += mirrors_p1; comp %= ios->layout->s_numdevs; length -= cur_len; } out: - ios->numdevs = stripes * ios->layout->mirrors_p1; + ios->numdevs = stripes * mirrors_p1; return ret; } +static int _prepare_for_striping(struct exofs_io_state *ios) +{ + struct _striping_info si; + + _calc_stripe_info(ios, ios->offset, &si); + + if (!ios->pages) { + if (ios->kern_buff) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; + + per_dev->offset = si.obj_offset; + per_dev->dev = si.dev; + + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (si.unit_off + ios->length > + ios->layout->stripe_unit)); + } + ios->numdevs = ios->layout->mirrors_p1; + return 0; + } + + return _prepare_pages(ios, &si); +} + int exofs_sbi_create(struct exofs_io_state *ios) { int i, ret; @@ -648,9 +689,7 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) struct osd_attr attr; __be64 newsize; } *size_attrs; - u64 this_obj_size; - unsigned dev; - unsigned unit_off; + struct _striping_info si; int i, ret; ret = exofs_get_io_state(&sbi->layout, &ios); @@ -668,19 +707,19 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) ios->cred = oi->i_cred; ios->numdevs = ios->layout->s_numdevs; - _offset_dev_unit_off(ios, size, &this_obj_size, &dev, &unit_off); + _calc_stripe_info(ios, size, &si); for (i = 0; i < ios->layout->group_width; ++i) { struct exofs_trunc_attr *size_attr = &size_attrs[i]; u64 obj_size; - if (i < dev) - obj_size = this_obj_size + - ios->layout->stripe_unit - unit_off; - else if (i == dev) - obj_size = this_obj_size; - else /* i > dev */ - obj_size = this_obj_size - unit_off; + if (i < si.dev) + obj_size = si.obj_offset + + ios->layout->stripe_unit - si.unit_off; + else if (i == si.dev) + obj_size = si.obj_offset; + else /* i > si.dev */ + obj_size = si.obj_offset - si.unit_off; size_attr->newsize = cpu_to_be64(obj_size); size_attr->attr = g_attr_logical_length; -- cgit v1.2.3 From 50a76fd3c352ed2740eba01512efcfceee0703be Mon Sep 17 00:00:00 2001 From: Boaz Harrosh <bharrosh@panasas.com> Date: Thu, 11 Feb 2010 13:01:39 +0200 Subject: exofs: groups support * _calc_stripe_info() changes to accommodate for grouping calculations. Returns additional information * old _prepare_pages() becomes _prepare_one_group() which stores pages belonging to one device group. * New _prepare_for_striping iterates on all groups calling _prepare_one_group(). * Enable mounting of groups data_maps (group_width != 0) [QUESTION] what is faster A or B; A. x += stride; x = x % width + first_x; B x += stride if (x < last_x) x = first_x; Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> --- fs/exofs/exofs.h | 3 ++ fs/exofs/ios.c | 129 +++++++++++++++++++++++++++++++++++++++++++++---------- fs/exofs/super.c | 46 ++++++++++++++------ 3 files changed, 141 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index acfebd36de83..59b8bf2825c7 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -63,6 +63,8 @@ struct exofs_layout { unsigned mirrors_p1; unsigned group_width; + u64 group_depth; + unsigned group_count; enum exofs_inode_layout_gen_functions lay_func; @@ -132,6 +134,7 @@ struct exofs_io_state { struct page **pages; unsigned nr_pages; unsigned pgbase; + unsigned pages_consumed; /* Attributes */ unsigned in_attr_len; diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index d28febdf54ab..5293bc411d17 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -262,25 +262,50 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) /* * L - logical offset into the file * - * U - The number of bytes in a full stripe + * U - The number of bytes in a stripe within a group * * U = stripe_unit * group_width * - * N - The stripe number + * T - The number of bytes striped within a group of component objects + * (before advancing to the next group) * - * N = L / U + * T = stripe_unit * group_width * group_depth + * + * S - The number of bytes striped across all component objects + * before the pattern repeats + * + * S = stripe_unit * group_width * group_depth * group_count + * + * M - The "major" (i.e., across all components) stripe number + * + * M = L / S + * + * G - Counts the groups from the beginning of the major stripe + * + * G = (L - (M * S)) / T [or (L % S) / T] + * + * H - The byte offset within the group + * + * H = (L - (M * S)) % T [or (L % S) % T] + * + * N - The "minor" (i.e., across the group) stripe number + * + * N = H / U * * C - The component index coresponding to L * - * C = (L - (N*U)) / stripe_unit + * C = (H - (N * U)) / stripe_unit + G * group_width + * [or (L % U) / stripe_unit + G * group_width] * * O - The component offset coresponding to L * - * (N*stripe_unit)+(L%stripe_unit) + * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit */ - struct _striping_info { u64 obj_offset; + u64 group_length; + u64 total_group_length; + u64 Major; unsigned dev; unsigned unit_off; }; @@ -290,15 +315,35 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, { u32 stripe_unit = ios->layout->stripe_unit; u32 group_width = ios->layout->group_width; + u64 group_depth = ios->layout->group_depth; + u32 U = stripe_unit * group_width; + u64 T = U * group_depth; + u64 S = T * ios->layout->group_count; + u64 M = div64_u64(file_offset, S); + + /* + G = (L - (M * S)) / T + H = (L - (M * S)) % T + */ + u64 LmodS = file_offset - M * S; + u32 G = div64_u64(LmodS, T); + u64 H = LmodS - G * T; + + u32 N = div_u64(H, U); + + /* "H - (N * U)" is just "H % U" so it's bound to u32 */ + si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; + si->dev *= ios->layout->mirrors_p1; - u32 LmodU; - u64 N = div_u64_rem(file_offset, U, &LmodU); + div_u64_rem(file_offset, stripe_unit, &si->unit_off); - si->unit_off = LmodU % stripe_unit; - si->obj_offset = N * stripe_unit + si->unit_off; - si->dev = LmodU / stripe_unit; - si->dev *= ios->layout->mirrors_p1; + si->obj_offset = si->unit_off + (N * stripe_unit) + + (M * group_depth * stripe_unit); + + si->group_length = T - H; + si->total_group_length = T; + si->Major = M; } static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, @@ -345,16 +390,17 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, return 0; } -static int _prepare_pages(struct exofs_io_state *ios, - struct _striping_info *si) +static int _prepare_one_group(struct exofs_io_state *ios, u64 length, + struct _striping_info *si, unsigned first_comp) { - u64 length = ios->length; unsigned stripe_unit = ios->layout->stripe_unit; unsigned mirrors_p1 = ios->layout->mirrors_p1; + unsigned devs_in_group = ios->layout->group_width * mirrors_p1; unsigned dev = si->dev; - unsigned comp = 0; - unsigned stripes = 0; - unsigned cur_pg = 0; + unsigned first_dev = dev - (dev % devs_in_group); + unsigned comp = first_comp + (dev - first_dev); + unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; + unsigned cur_pg = ios->pages_consumed; int ret = 0; while (length) { @@ -377,10 +423,11 @@ static int _prepare_pages(struct exofs_io_state *ios, cur_len = stripe_unit; } - stripes++; + if (max_comp < comp) + max_comp = comp; dev += mirrors_p1; - dev %= ios->layout->s_numdevs; + dev = (dev % devs_in_group) + first_dev; } else { cur_len = stripe_unit; } @@ -393,18 +440,24 @@ static int _prepare_pages(struct exofs_io_state *ios, goto out; comp += mirrors_p1; - comp %= ios->layout->s_numdevs; + comp = (comp % devs_in_group) + first_comp; length -= cur_len; } out: - ios->numdevs = stripes * mirrors_p1; + ios->numdevs = max_comp + mirrors_p1; + ios->pages_consumed = cur_pg; return ret; } static int _prepare_for_striping(struct exofs_io_state *ios) { + u64 length = ios->length; struct _striping_info si; + unsigned devs_in_group = ios->layout->group_width * + ios->layout->mirrors_p1; + unsigned first_comp = 0; + int ret = 0; _calc_stripe_info(ios, ios->offset, &si); @@ -424,7 +477,31 @@ static int _prepare_for_striping(struct exofs_io_state *ios) return 0; } - return _prepare_pages(ios, &si); + while (length) { + if (length < si.group_length) + si.group_length = length; + + ret = _prepare_one_group(ios, si.group_length, &si, first_comp); + if (unlikely(ret)) + goto out; + + length -= si.group_length; + + si.group_length = si.total_group_length; + si.unit_off = 0; + ++si.Major; + si.obj_offset = si.Major * ios->layout->stripe_unit * + ios->layout->group_depth; + + si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group; + si.dev %= ios->layout->s_numdevs; + + first_comp += devs_in_group; + first_comp %= ios->layout->s_numdevs; + } + +out: + return ret; } int exofs_sbi_create(struct exofs_io_state *ios) @@ -482,6 +559,9 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) unsigned last_comp = cur_comp + ios->layout->mirrors_p1; int ret = 0; + if (ios->pages && !master_dev->length) + return 0; /* Just an empty slot */ + for (; cur_comp < last_comp; ++cur_comp, ++dev) { struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; struct osd_request *or; @@ -580,6 +660,9 @@ static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; unsigned first_dev = (unsigned)ios->obj.id; + if (ios->pages && !per_dev->length) + return 0; /* Just an empty slot */ + first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); if (unlikely(!or)) { diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 8f4e4b37a578..6cf5e4e84d61 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -323,11 +323,7 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, sbi->data_map.odm_raid_algorithm = le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); -/* FIXME: Only raid0 !group_width/depth for now. if not so, do not mount */ - if (sbi->data_map.odm_group_width || sbi->data_map.odm_group_depth) { - EXOFS_ERR("Group width/depth not supported\n"); - return -EINVAL; - } +/* FIXME: Only raid0 for now. if not so, do not mount */ if (sbi->data_map.odm_num_comps != numdevs) { EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", sbi->data_map.odm_num_comps, numdevs); @@ -343,14 +339,6 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, return -EINVAL; } - stripe_length = sbi->data_map.odm_stripe_unit * - (numdevs / (sbi->data_map.odm_mirror_cnt + 1)); - if (stripe_length >= (1ULL << 32)) { - EXOFS_ERR("Total Stripe length(0x%llx)" - " >= 32bit is not supported\n", _LLU(stripe_length)); - return -EINVAL; - } - if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { EXOFS_ERR("Stripe Unit(0x%llx)" " must be Multples of PAGE_SIZE(0x%lx)\n", @@ -360,8 +348,36 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; - sbi->layout.group_width = sbi->data_map.odm_num_comps / + + if (sbi->data_map.odm_group_width) { + sbi->layout.group_width = sbi->data_map.odm_group_width; + sbi->layout.group_depth = sbi->data_map.odm_group_depth; + if (!sbi->layout.group_depth) { + EXOFS_ERR("group_depth == 0 && group_width != 0\n"); + return -EINVAL; + } + sbi->layout.group_count = sbi->data_map.odm_num_comps / + sbi->layout.mirrors_p1 / + sbi->data_map.odm_group_width; + } else { + if (sbi->data_map.odm_group_depth) { + printk(KERN_NOTICE "Warning: group_depth ignored " + "group_width == 0 && group_depth == %d\n", + sbi->data_map.odm_group_depth); + sbi->data_map.odm_group_depth = 0; + } + sbi->layout.group_width = sbi->data_map.odm_num_comps / sbi->layout.mirrors_p1; + sbi->layout.group_depth = -1; + sbi->layout.group_count = 1; + } + + stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit; + if (stripe_length >= (1ULL << 32)) { + EXOFS_ERR("Total Stripe length(0x%llx)" + " >= 32bit is not supported\n", _LLU(stripe_length)); + return -EINVAL; + } return 0; } @@ -540,6 +556,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sbi->layout.stripe_unit = PAGE_SIZE; sbi->layout.mirrors_p1 = 1; sbi->layout.group_width = 1; + sbi->layout.group_depth = -1; + sbi->layout.group_count = 1; sbi->layout.s_ods[0] = od; sbi->layout.s_numdevs = 1; sbi->layout.s_pid = opts->pid; -- cgit v1.2.3 From 9f7cdbc33f36d28e57eaba0093f68f0d14c38c5b Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov <dmonakhov@openvz.org> Date: Sat, 27 Feb 2010 20:35:12 +0300 Subject: blkdev: fix merge_bvec_fn return value checks merge_bvec_fn() returns bvec->bv_len on success. So we have to check against this value. But in case of fs_optimization merge we compare with wrong value. This patch must be included in b428cd6da7e6559aca69aa2e3a526037d3f20403 But accidentally i've forgot to add this in the initial patch. To make things straight let's replace all such checks. In fact this makes code easy to understand. Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: Jens Axboe <jens.axboe@oracle.com> --- fs/bio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index dc17afd672e3..0bda289f86fc 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -555,7 +555,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page .bi_rw = bio->bi_rw, }; - if (q->merge_bvec_fn(q, &bvm, prev) < len) { + if (q->merge_bvec_fn(q, &bvm, prev) != prev->bv_len) { prev->bv_len -= len; return 0; } @@ -608,7 +608,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * merge_bvec_fn() returns number of bytes it can accept * at this offset */ - if (q->merge_bvec_fn(q, &bvm, bvec) < len) { + if (q->merge_bvec_fn(q, &bvm, bvec) != bvec->bv_len) { bvec->bv_page = NULL; bvec->bv_len = 0; bvec->bv_offset = 0; -- cgit v1.2.3 From 009d851837ab26cab18adda6169a813f70b0b21b Mon Sep 17 00:00:00 2001 From: Steven Whitehouse <swhiteho@redhat.com> Date: Tue, 8 Dec 2009 12:12:13 +0000 Subject: GFS2: Metadata address space clean up Since the start of GFS2, an "extra" inode has been used to store the metadata belonging to each inode. The only reason for using this inode was to have an extra address space, the other fields were unused. This means that the memory usage was rather inefficient. The reason for keeping each inode's metadata in a separate address space is that when glocks are requested on remote nodes, we need to be able to efficiently locate the data and metadata which relating to that glock (inode) in order to sync or sync and invalidate it (depending on the remotely requested lock mode). This patch adds a new type of glock, which has in addition to its normal fields, has an address space. This applies to all inode and rgrp glocks (but to no other glock types which remain as before). As a result, we no longer need to have the second inode. This results in three major improvements: 1. A saving of approx 25% of memory used in caching inodes 2. A removal of the circular dependency between inodes and glocks 3. No confusion between "normal" and "metadata" inodes in super.c Although the first of these is the more immediately apparent, the second is just as important as it now enables a number of clean ups at umount time. Those will be the subject of future patches. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com> --- fs/gfs2/aops.c | 4 ++-- fs/gfs2/glock.c | 40 +++++++++++++++++++++------------------- fs/gfs2/glock.h | 7 +++++++ fs/gfs2/glops.c | 16 +++++++++------- fs/gfs2/incore.h | 4 ++-- fs/gfs2/inode.c | 6 ++---- fs/gfs2/lock_dlm.c | 5 ++++- fs/gfs2/main.c | 28 ++++++++++++++++++++++++++++ fs/gfs2/meta_io.c | 46 ++++++---------------------------------------- fs/gfs2/meta_io.h | 12 ++++++++++-- fs/gfs2/super.c | 26 ++++++++------------------ fs/gfs2/util.c | 1 + fs/gfs2/util.h | 1 + 13 files changed, 101 insertions(+), 95 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 7b8da9415267..0c1d0b82dcf1 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -1061,8 +1061,8 @@ out: int gfs2_releasepage(struct page *page, gfp_t gfp_mask) { - struct inode *aspace = page->mapping->host; - struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info; + struct address_space *mapping = page->mapping; + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); struct buffer_head *bh, *head; struct gfs2_bufdata *bd; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index f42663325931..dfb10a4d467e 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -154,12 +154,14 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp, static void glock_free(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; - struct inode *aspace = gl->gl_aspace; + struct address_space *mapping = gfs2_glock2aspace(gl); + struct kmem_cache *cachep = gfs2_glock_cachep; - if (aspace) - gfs2_aspace_put(aspace); + GLOCK_BUG_ON(gl, mapping && mapping->nrpages); trace_gfs2_glock_put(gl); - sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl); + if (mapping) + cachep = gfs2_glock_aspace_cachep; + sdp->sd_lockstruct.ls_ops->lm_put_lock(cachep, gl); } /** @@ -750,10 +752,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp) { + struct super_block *s = sdp->sd_vfs; struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type }; struct gfs2_glock *gl, *tmp; unsigned int hash = gl_hash(sdp, &name); - int error; + struct address_space *mapping; read_lock(gl_lock_addr(hash)); gl = search_bucket(hash, sdp, &name); @@ -765,7 +768,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, if (!create) return -ENOENT; - gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); + if (glops->go_flags & GLOF_ASPACE) + gl = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_KERNEL); + else + gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); if (!gl) return -ENOMEM; @@ -784,18 +790,18 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_tchange = jiffies; gl->gl_object = NULL; gl->gl_sbd = sdp; - gl->gl_aspace = NULL; INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); INIT_WORK(&gl->gl_delete, delete_work_func); - /* If this glock protects actual on-disk data or metadata blocks, - create a VFS inode to manage the pages/buffers holding them. */ - if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) { - gl->gl_aspace = gfs2_aspace_get(sdp); - if (!gl->gl_aspace) { - error = -ENOMEM; - goto fail; - } + mapping = gfs2_glock2aspace(gl); + if (mapping) { + mapping->a_ops = &gfs2_meta_aops; + mapping->host = s->s_bdev->bd_inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = s->s_bdi; + mapping->writeback_index = 0; } write_lock(gl_lock_addr(hash)); @@ -812,10 +818,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, *glp = gl; return 0; - -fail: - kmem_cache_free(gfs2_glock_cachep, gl); - return error; } /** diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index c0262faf4725..2bda1911b156 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -180,6 +180,13 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl) return gl->gl_state == LM_ST_SHARED; } +static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) +{ + if (gl->gl_ops->go_flags & GLOF_ASPACE) + return (struct address_space *)(gl + 1); + return NULL; +} + int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 78554acc0605..38e3749d476c 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -87,7 +87,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) static void rgrp_go_sync(struct gfs2_glock *gl) { - struct address_space *metamapping = gl->gl_aspace->i_mapping; + struct address_space *metamapping = gfs2_glock2aspace(gl); int error; if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) @@ -113,7 +113,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl) static void rgrp_go_inval(struct gfs2_glock *gl, int flags) { - struct address_space *mapping = gl->gl_aspace->i_mapping; + struct address_space *mapping = gfs2_glock2aspace(gl); BUG_ON(!(flags & DIO_METADATA)); gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); @@ -134,7 +134,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) static void inode_go_sync(struct gfs2_glock *gl) { struct gfs2_inode *ip = gl->gl_object; - struct address_space *metamapping = gl->gl_aspace->i_mapping; + struct address_space *metamapping = gfs2_glock2aspace(gl); int error; if (ip && !S_ISREG(ip->i_inode.i_mode)) @@ -183,7 +183,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); if (flags & DIO_METADATA) { - struct address_space *mapping = gl->gl_aspace->i_mapping; + struct address_space *mapping = gfs2_glock2aspace(gl); truncate_inode_pages(mapping, 0); if (ip) { set_bit(GIF_INVALID, &ip->i_flags); @@ -282,7 +282,8 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) static int rgrp_go_demote_ok(const struct gfs2_glock *gl) { - return !gl->gl_aspace->i_mapping->nrpages; + const struct address_space *mapping = (const struct address_space *)(gl + 1); + return !mapping->nrpages; } /** @@ -387,8 +388,7 @@ static void iopen_go_callback(struct gfs2_glock *gl) struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; if (gl->gl_demote_state == LM_ST_UNLOCKED && - gl->gl_state == LM_ST_SHARED && - ip && test_bit(GIF_USER, &ip->i_flags)) { + gl->gl_state == LM_ST_SHARED && ip) { gfs2_glock_hold(gl); if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) gfs2_glock_put_nolock(gl); @@ -407,6 +407,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = { .go_dump = inode_go_dump, .go_type = LM_TYPE_INODE, .go_min_hold_time = HZ / 5, + .go_flags = GLOF_ASPACE, }; const struct gfs2_glock_operations gfs2_rgrp_glops = { @@ -418,6 +419,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, .go_min_hold_time = HZ / 5, + .go_flags = GLOF_ASPACE, }; const struct gfs2_glock_operations gfs2_trans_glops = { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index bc0ad158e6b4..1de7e1b7ce83 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -162,6 +162,8 @@ struct gfs2_glock_operations { void (*go_callback) (struct gfs2_glock *gl); const int go_type; const unsigned long go_min_hold_time; + const unsigned long go_flags; +#define GLOF_ASPACE 1 }; enum { @@ -225,7 +227,6 @@ struct gfs2_glock { struct gfs2_sbd *gl_sbd; - struct inode *gl_aspace; struct list_head gl_ail_list; atomic_t gl_ail_count; struct delayed_work gl_work; @@ -258,7 +259,6 @@ enum { GIF_INVALID = 0, GIF_QD_LOCKED = 1, GIF_SW_PAGED = 3, - GIF_USER = 4, /* user inode, not metadata addr space */ }; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6e220f4eee7d..b1bf2694fb2b 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -45,7 +45,7 @@ static int iget_test(struct inode *inode, void *opaque) struct gfs2_inode *ip = GFS2_I(inode); u64 *no_addr = opaque; - if (ip->i_no_addr == *no_addr && test_bit(GIF_USER, &ip->i_flags)) + if (ip->i_no_addr == *no_addr) return 1; return 0; @@ -58,7 +58,6 @@ static int iget_set(struct inode *inode, void *opaque) inode->i_ino = (unsigned long)*no_addr; ip->i_no_addr = *no_addr; - set_bit(GIF_USER, &ip->i_flags); return 0; } @@ -84,7 +83,7 @@ static int iget_skip_test(struct inode *inode, void *opaque) struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_skip_data *data = opaque; - if (ip->i_no_addr == data->no_addr && test_bit(GIF_USER, &ip->i_flags)){ + if (ip->i_no_addr == data->no_addr) { if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){ data->skipped = 1; return 0; @@ -103,7 +102,6 @@ static int iget_skip_set(struct inode *inode, void *opaque) return 1; inode->i_ino = (unsigned long)(data->no_addr); ip->i_no_addr = data->no_addr; - set_bit(GIF_USER, &ip->i_flags); return 0; } diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 0e5e0e7022e5..569b46240f61 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -30,7 +30,10 @@ static void gdlm_ast(void *arg) switch (gl->gl_lksb.sb_status) { case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ - kmem_cache_free(gfs2_glock_cachep, gl); + if (gl->gl_ops->go_flags & GLOF_ASPACE) + kmem_cache_free(gfs2_glock_aspace_cachep, gl); + else + kmem_cache_free(gfs2_glock_cachep, gl); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_glock_wait); return; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 5b31f7741a8f..a88fadc704bb 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -52,6 +52,22 @@ static void gfs2_init_glock_once(void *foo) atomic_set(&gl->gl_ail_count, 0); } +static void gfs2_init_gl_aspace_once(void *foo) +{ + struct gfs2_glock *gl = foo; + struct address_space *mapping = (struct address_space *)(gl + 1); + + gfs2_init_glock_once(gl); + memset(mapping, 0, sizeof(*mapping)); + INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); + spin_lock_init(&mapping->tree_lock); + spin_lock_init(&mapping->i_mmap_lock); + INIT_LIST_HEAD(&mapping->private_list); + spin_lock_init(&mapping->private_lock); + INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); + INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); +} + /** * init_gfs2_fs - Register GFS2 as a filesystem * @@ -78,6 +94,14 @@ static int __init init_gfs2_fs(void) if (!gfs2_glock_cachep) goto fail; + gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)", + sizeof(struct gfs2_glock) + + sizeof(struct address_space), + 0, 0, gfs2_init_gl_aspace_once); + + if (!gfs2_glock_aspace_cachep) + goto fail; + gfs2_inode_cachep = kmem_cache_create("gfs2_inode", sizeof(struct gfs2_inode), 0, SLAB_RECLAIM_ACCOUNT| @@ -144,6 +168,9 @@ fail: if (gfs2_inode_cachep) kmem_cache_destroy(gfs2_inode_cachep); + if (gfs2_glock_aspace_cachep) + kmem_cache_destroy(gfs2_glock_aspace_cachep); + if (gfs2_glock_cachep) kmem_cache_destroy(gfs2_glock_cachep); @@ -169,6 +196,7 @@ static void __exit exit_gfs2_fs(void) kmem_cache_destroy(gfs2_rgrpd_cachep); kmem_cache_destroy(gfs2_bufdata_cachep); kmem_cache_destroy(gfs2_inode_cachep); + kmem_cache_destroy(gfs2_glock_aspace_cachep); kmem_cache_destroy(gfs2_glock_cachep); gfs2_sys_uninit(); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 6f68a5f18eb8..0bb12c80937a 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -93,48 +93,12 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb return err; } -static const struct address_space_operations aspace_aops = { +const struct address_space_operations gfs2_meta_aops = { .writepage = gfs2_aspace_writepage, .releasepage = gfs2_releasepage, .sync_page = block_sync_page, }; -/** - * gfs2_aspace_get - Create and initialize a struct inode structure - * @sdp: the filesystem the aspace is in - * - * Right now a struct inode is just a struct inode. Maybe Linux - * will supply a more lightweight address space construct (that works) - * in the future. - * - * Make sure pages/buffers in this aspace aren't in high memory. - * - * Returns: the aspace - */ - -struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp) -{ - struct inode *aspace; - struct gfs2_inode *ip; - - aspace = new_inode(sdp->sd_vfs); - if (aspace) { - mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS); - aspace->i_mapping->a_ops = &aspace_aops; - aspace->i_size = MAX_LFS_FILESIZE; - ip = GFS2_I(aspace); - clear_bit(GIF_USER, &ip->i_flags); - insert_inode_hash(aspace); - } - return aspace; -} - -void gfs2_aspace_put(struct inode *aspace) -{ - remove_inode_hash(aspace); - iput(aspace); -} - /** * gfs2_meta_sync - Sync all buffers associated with a glock * @gl: The glock @@ -143,7 +107,7 @@ void gfs2_aspace_put(struct inode *aspace) void gfs2_meta_sync(struct gfs2_glock *gl) { - struct address_space *mapping = gl->gl_aspace->i_mapping; + struct address_space *mapping = gfs2_glock2aspace(gl); int error; filemap_fdatawrite(mapping); @@ -164,7 +128,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl) struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) { - struct address_space *mapping = gl->gl_aspace->i_mapping; + struct address_space *mapping = gfs2_glock2aspace(gl); struct gfs2_sbd *sdp = gl->gl_sbd; struct page *page; struct buffer_head *bh; @@ -344,8 +308,10 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta) { - struct gfs2_sbd *sdp = GFS2_SB(bh->b_page->mapping->host); + struct address_space *mapping = bh->b_page->mapping; + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); struct gfs2_bufdata *bd = bh->b_private; + if (test_clear_buffer_pinned(bh)) { list_del_init(&bd->bd_le.le_list); if (meta) { diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index de270c2f9b63..6a1d9ba16411 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -37,8 +37,16 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh, 0, from_head - to_head); } -struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp); -void gfs2_aspace_put(struct inode *aspace); +extern const struct address_space_operations gfs2_meta_aops; + +static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) +{ + struct inode *inode = mapping->host; + if (mapping->a_ops == &gfs2_meta_aops) + return (((struct gfs2_glock *)mapping) - 1)->gl_sbd; + else + return inode->i_sb->s_fs_info; +} void gfs2_meta_sync(struct gfs2_glock *gl); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b9dd3da22c0a..ad7bc2d25ac2 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -722,8 +722,7 @@ static int gfs2_write_inode(struct inode *inode, int sync) int ret = 0; /* Check this is a "normal" inode, etc */ - if (!test_bit(GIF_USER, &ip->i_flags) || - (current->flags & PF_MEMALLOC)) + if (current->flags & PF_MEMALLOC) return 0; ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); if (ret) @@ -1194,7 +1193,7 @@ static void gfs2_drop_inode(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); - if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) { + if (inode->i_nlink) { struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) clear_nlink(inode); @@ -1212,18 +1211,12 @@ static void gfs2_clear_inode(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); - /* This tells us its a "real" inode and not one which only - * serves to contain an address space (see rgrp.c, meta_io.c) - * which therefore doesn't have its own glocks. - */ - if (test_bit(GIF_USER, &ip->i_flags)) { - ip->i_gl->gl_object = NULL; - gfs2_glock_put(ip->i_gl); - ip->i_gl = NULL; - if (ip->i_iopen_gh.gh_gl) { - ip->i_iopen_gh.gh_gl->gl_object = NULL; - gfs2_glock_dq_uninit(&ip->i_iopen_gh); - } + ip->i_gl->gl_object = NULL; + gfs2_glock_put(ip->i_gl); + ip->i_gl = NULL; + if (ip->i_iopen_gh.gh_gl) { + ip->i_iopen_gh.gh_gl->gl_object = NULL; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); } } @@ -1358,9 +1351,6 @@ static void gfs2_delete_inode(struct inode *inode) struct gfs2_holder gh; int error; - if (!test_bit(GIF_USER, &ip->i_flags)) - goto out; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); if (unlikely(error)) { gfs2_glock_dq_uninit(&ip->i_iopen_gh); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index f6a7efa34eb9..226f2bfbf16a 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -21,6 +21,7 @@ #include "util.h" struct kmem_cache *gfs2_glock_cachep __read_mostly; +struct kmem_cache *gfs2_glock_aspace_cachep __read_mostly; struct kmem_cache *gfs2_inode_cachep __read_mostly; struct kmem_cache *gfs2_bufdata_cachep __read_mostly; struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 33e96b0ce9ab..b432e04600de 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -145,6 +145,7 @@ gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__); extern struct kmem_cache *gfs2_glock_cachep; +extern struct kmem_cache *gfs2_glock_aspace_cachep; extern struct kmem_cache *gfs2_inode_cachep; extern struct kmem_cache *gfs2_bufdata_cachep; extern struct kmem_cache *gfs2_rgrpd_cachep; -- cgit v1.2.3 From c1184f8ab7ea26681f3cab18284a870aad678b0f Mon Sep 17 00:00:00 2001 From: Steven Whitehouse <swhiteho@redhat.com> Date: Fri, 8 Jan 2010 16:14:29 +0000 Subject: GFS2: Remove loopy umount code As a consequence of the previous patch, we can now remove the loop which used to be required due to the circular dependency between the inodes and glocks. Instead we can just invalidate the inodes, and then clear up any glocks which are left. Also we no longer need the rwsem since there is no longer any danger of the inode invalidation calling back into the glock code (and from there back into the inode code). Signed-off-by: Steven Whitehouse <swhiteho@redhat.com> --- fs/gfs2/glock.c | 33 ++------------------------------- fs/gfs2/incore.h | 1 - fs/gfs2/ops_fstype.c | 4 +--- fs/gfs2/super.c | 1 + fs/gfs2/sys.c | 2 -- 5 files changed, 4 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index dfb10a4d467e..4773f9098a41 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -19,7 +19,6 @@ #include <linux/list.h> #include <linux/wait.h> #include <linux/module.h> -#include <linux/rwsem.h> #include <asm/uaccess.h> #include <linux/seq_file.h> #include <linux/debugfs.h> @@ -60,7 +59,6 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); -static DECLARE_RWSEM(gfs2_umount_flush_sem); static struct dentry *gfs2_root; static struct workqueue_struct *glock_workqueue; struct workqueue_struct *gfs2_delete_workqueue; @@ -714,7 +712,6 @@ static void glock_work_func(struct work_struct *work) finish_xmote(gl, gl->gl_reply); drop_ref = 1; } - down_read(&gfs2_umount_flush_sem); spin_lock(&gl->gl_spin); if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && @@ -727,7 +724,6 @@ static void glock_work_func(struct work_struct *work) } run_queue(gl, 0); spin_unlock(&gl->gl_spin); - up_read(&gfs2_umount_flush_sem); if (!delay || queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) gfs2_glock_put(gl); @@ -1512,35 +1508,10 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp) void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) { - unsigned long t; unsigned int x; - int cont; - t = jiffies; - - for (;;) { - cont = 0; - for (x = 0; x < GFS2_GL_HASH_SIZE; x++) { - if (examine_bucket(clear_glock, sdp, x)) - cont = 1; - } - - if (!cont) - break; - - if (time_after_eq(jiffies, - t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) { - fs_warn(sdp, "Unmount seems to be stalled. " - "Dumping lock state...\n"); - gfs2_dump_lockstate(sdp); - t = jiffies; - } - - down_write(&gfs2_umount_flush_sem); - invalidate_inodes(sdp->sd_vfs); - up_write(&gfs2_umount_flush_sem); - msleep(10); - } + for (x = 0; x < GFS2_GL_HASH_SIZE; x++) + examine_bucket(clear_glock, sdp, x); flush_workqueue(glock_workqueue); wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); gfs2_dump_lockstate(sdp); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 1de7e1b7ce83..b8025e51cabf 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -451,7 +451,6 @@ struct gfs2_tune { unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ unsigned int gt_new_files_jdata; unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ - unsigned int gt_stall_secs; /* Detects trouble! */ unsigned int gt_complain_secs; unsigned int gt_statfs_quantum; unsigned int gt_statfs_slow; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index a86ed6381566..a054b526dc08 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -65,7 +65,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt) gt->gt_quota_scale_den = 1; gt->gt_new_files_jdata = 0; gt->gt_max_readahead = 1 << 18; - gt->gt_stall_secs = 600; gt->gt_complain_secs = 10; } @@ -1241,10 +1240,9 @@ fail_sb: fail_locking: init_locking(sdp, &mount_gh, UNDO); fail_lm: + invalidate_inodes(sb); gfs2_gl_hash_clear(sdp); gfs2_lm_unmount(sdp); - while (invalidate_inodes(sb)) - yield(); fail_sys: gfs2_sys_fs_del(sdp); fail: diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index ad7bc2d25ac2..e5e22629da67 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -859,6 +859,7 @@ restart: gfs2_clear_rgrpd(sdp); gfs2_jindex_free(sdp); /* Take apart glock structures and buffer lists */ + invalidate_inodes(sdp->sd_vfs); gfs2_gl_hash_clear(sdp); /* Unmount the locking protocol */ gfs2_lm_unmount(sdp); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 0dc34621f6a6..a0db1c94317d 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -478,7 +478,6 @@ TUNE_ATTR(complain_secs, 0); TUNE_ATTR(statfs_slow, 0); TUNE_ATTR(new_files_jdata, 0); TUNE_ATTR(quota_simul_sync, 1); -TUNE_ATTR(stall_secs, 1); TUNE_ATTR(statfs_quantum, 1); TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); @@ -491,7 +490,6 @@ static struct attribute *tune_attrs[] = { &tune_attr_complain_secs.attr, &tune_attr_statfs_slow.attr, &tune_attr_quota_simul_sync.attr, - &tune_attr_stall_secs.attr, &tune_attr_statfs_quantum.attr, &tune_attr_quota_scale.attr, &tune_attr_new_files_jdata.attr, -- cgit v1.2.3 From e5884636da3a128617032747654284ae7badc7ff Mon Sep 17 00:00:00 2001 From: Dave Chinner <dchinner@redhat.com> Date: Fri, 5 Feb 2010 16:45:25 +1100 Subject: GFS2: ordered writes are backwards When we queue data buffers for ordered write, the buffers are added to the head of the ordered write list. When the log needs to push these buffers to disk, it also walks the list from the head. The result is that the the ordered buffers are submitted to disk in reverse order. For large writes, this means that whenever the log flushes large streams of reverse sequential order buffers are pushed down into the block layers. The elevators don't handle this particularly well, so IO rates tend to be significantly lower than if the IO was issued in ascending block order. Queue new ordered buffers to the tail of the ordered buffer list to ensure that IO is dispatched in the order it was submitted. This should significantly improve large sequential write speeds. On a disk capable of 85MB/s, speeds increase from 50MB/s to 65MB/s for noop and from 38MB/s to 50MB/s for cfq. Signed-off-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Steven Whitehouse <swhiteho@redhat.com> --- fs/gfs2/lops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index de97632ba32f..adc260fbea90 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -528,9 +528,9 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) gfs2_pin(sdp, bd->bd_bh); tr->tr_num_databuf_new++; sdp->sd_log_num_databuf++; - list_add(&le->le_list, &sdp->sd_log_le_databuf); + list_add_tail(&le->le_list, &sdp->sd_log_le_databuf); } else { - list_add(&le->le_list, &sdp->sd_log_le_ordered); + list_add_tail(&le->le_list, &sdp->sd_log_le_ordered); } out: gfs2_log_unlock(sdp); -- cgit v1.2.3 From 4818972efb105730f007e5efc05e203a065fc318 Mon Sep 17 00:00:00 2001 From: Bob Peterson <rpeterso@redhat.com> Date: Tue, 23 Feb 2010 12:20:00 -0500 Subject: GFS2: print glock numbers in hex This patch changes glock numbers from printing in decimal to hex. Since DLM prints corresponding resource IDs in hex, it makes debugging easier. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Steven Whitehouse <swhiteho@redhat.com> --- fs/gfs2/glock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 4773f9098a41..454d4b4eb36b 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1658,7 +1658,7 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) dtime *= 1000000/HZ; /* demote time in uSec */ if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) dtime = 0; - gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu a:%d r:%d\n", + gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d r:%d\n", state2str(gl->gl_state), gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number, -- cgit v1.2.3 From a9cc799eca0c798ab5dd8648564fc2025bdd9bd2 Mon Sep 17 00:00:00 2001 From: Eric Sandeen <sandeen@sandeen.net> Date: Wed, 3 Feb 2010 17:50:13 +0000 Subject: xfs: increase readdir buffer size While doing some testing of readdir perf a while back, I noticed that the buffer size we're using internally is smaller than what glibc gives us by default. Upping this size helped a bit, and seems safe. glibc's __alloc_dir() does: const size_t default_allocation = (4 * BUFSIZ < sizeof (struct dirent64) ? sizeof (struct dirent64) : 4 * BUFSIZ); const size_t small_allocation = (BUFSIZ < sizeof (struct dirent64) ? sizeof (struct dirent64) : BUFSIZ); size_t allocation = default_allocation; #ifdef _STATBUF_ST_BLKSIZE if (statp != NULL && default_allocation < statp->st_blksize) allocation = statp->st_blksize; #endif and #define _G_BUFSIZ 8192 #define _IO_BUFSIZ _G_BUFSIZ # define BUFSIZ _IO_BUFSIZ so the default buffer is 4 * 8192 = 32768 (except in the unlikely case of blocks > 32k....) Signed-off-by: Eric Sandeen <sandeen@sandeen.net> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index e4caeb28ce2e..3805ada98747 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -203,9 +203,9 @@ xfs_file_readdir( * * Try to give it an estimate that's good enough, maybe at some * point we can change the ->readdir prototype to include the - * buffer size. + * buffer size. For now we use the current glibc buffer size. */ - bufsize = (size_t)min_t(loff_t, PAGE_SIZE, ip->i_d.di_size); + bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); error = xfs_readdir(ip, dirent, bufsize, (xfs_off_t *)&filp->f_pos, filldir); -- cgit v1.2.3 From b262e5dfd9ddd2f6d0ef3fa53eb88690f22134a5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Sun, 14 Feb 2010 22:01:45 +0000 Subject: xfs: fix dquota trace format The be32_to_cpu in the TP_printk output breaks automatic parsing of the trace format by the trace-cmd tools, so we have to move it into the TP_assign block. While we're at it also fix the format for the quota limits to more regular and easier parseable. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_trace.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index a4574dcf5065..87f9fad80aac 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -593,7 +593,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, TP_ARGS(dqp), TP_STRUCT__entry( __field(dev_t, dev) - __field(__be32, id) + __field(u32, id) __field(unsigned, flags) __field(unsigned, nrefs) __field(unsigned long long, res_bcount) @@ -606,7 +606,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, ), \ TP_fast_assign( __entry->dev = dqp->q_mount->m_super->s_dev; - __entry->id = dqp->q_core.d_id; + __entry->id = be32_to_cpu(dqp->q_core.d_id); __entry->flags = dqp->dq_flags; __entry->nrefs = dqp->q_nrefs; __entry->res_bcount = dqp->q_res_bcount; @@ -622,10 +622,10 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, be64_to_cpu(dqp->q_core.d_ino_softlimit); ), TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " - "bcnt 0x%llx [hard 0x%llx | soft 0x%llx] " - "icnt 0x%llx [hard 0x%llx | soft 0x%llx]", + "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx " + "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]", MAJOR(__entry->dev), MINOR(__entry->dev), - be32_to_cpu(__entry->id), + __entry->id, __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), __entry->nrefs, __entry->res_bcount, -- cgit v1.2.3 From dda35b8f84d209784041bbad47f9e195a08a7527 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Mon, 15 Feb 2010 09:44:46 +0000 Subject: xfs: merge xfs_lrw.c into xfs_file.c Currently the code to implement the file operations is split over two small files. Merge the content of xfs_lrw.c into xfs_file.c to have it in one place. Note that I haven't done various cleanups that are possible after this yet, they will follow in the next patch. Also the function xfs_dev_is_read_only which was in xfs_lrw.c before really doesn't fit in here at all and was moved to xfs_mount.c. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/Makefile | 1 - fs/xfs/linux-2.6/xfs_file.c | 723 ++++++++++++++++++++++++++++++++++++++++ fs/xfs/linux-2.6/xfs_lrw.c | 796 -------------------------------------------- fs/xfs/xfs_mount.c | 20 ++ fs/xfs/xfs_vnodeops.h | 12 - 5 files changed, 743 insertions(+), 809 deletions(-) delete mode 100644 fs/xfs/linux-2.6/xfs_lrw.c (limited to 'fs') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 5c5a366aa332..b4769e40e8bc 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -105,7 +105,6 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \ xfs_globals.o \ xfs_ioctl.o \ xfs_iops.o \ - xfs_lrw.o \ xfs_super.o \ xfs_sync.o \ xfs_xattr.o) diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 3805ada98747..51fc510828a4 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -16,6 +16,7 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" +#include "xfs_fs.h" #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" @@ -34,16 +35,738 @@ #include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_vnodeops.h" #include "xfs_da_btree.h" #include "xfs_ioctl.h" +#include "xfs_trace.h" #include <linux/dcache.h> static const struct vm_operations_struct xfs_file_vm_ops; +/* + * xfs_iozero + * + * xfs_iozero clears the specified range of buffer supplied, + * and marks all the affected blocks as valid and modified. If + * an affected block is not allocated, it will be allocated. If + * an affected block is not completely overwritten, and is not + * valid before the operation, it will be read from disk before + * being partially zeroed. + */ +STATIC int +xfs_iozero( + struct xfs_inode *ip, /* inode */ + loff_t pos, /* offset in file */ + size_t count) /* size of data to zero */ +{ + struct page *page; + struct address_space *mapping; + int status; + + mapping = VFS_I(ip)->i_mapping; + do { + unsigned offset, bytes; + void *fsdata; + + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + status = pagecache_write_begin(NULL, mapping, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (status) + break; + + zero_user(page, offset, bytes); + + status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, + page, fsdata); + WARN_ON(status <= 0); /* can't return less than zero! */ + pos += bytes; + count -= bytes; + status = 0; + } while (count); + + return (-status); +} + +ssize_t /* bytes read, or (-) error */ +xfs_read( + xfs_inode_t *ip, + struct kiocb *iocb, + const struct iovec *iovp, + unsigned int segs, + loff_t *offset, + int ioflags) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + xfs_mount_t *mp = ip->i_mount; + size_t size = 0; + ssize_t ret = 0; + xfs_fsize_t n; + unsigned long seg; + + + XFS_STATS_INC(xs_read_calls); + + /* START copy & waste from filemap.c */ + for (seg = 0; seg < segs; seg++) { + const struct iovec *iv = &iovp[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + size += iv->iov_len; + if (unlikely((ssize_t)(size|iv->iov_len) < 0)) + return XFS_ERROR(-EINVAL); + } + /* END copy & waste from filemap.c */ + + if (unlikely(ioflags & IO_ISDIRECT)) { + xfs_buftarg_t *target = + XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + if ((*offset & target->bt_smask) || + (size & target->bt_smask)) { + if (*offset == ip->i_size) { + return (0); + } + return -XFS_ERROR(EINVAL); + } + } + + n = XFS_MAXIOFFSET(mp) - *offset; + if ((n <= 0) || (size == 0)) + return 0; + + if (n < size) + size = n; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (unlikely(ioflags & IO_ISDIRECT)) + mutex_lock(&inode->i_mutex); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + + if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { + int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); + int iolock = XFS_IOLOCK_SHARED; + + ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size, + dmflags, &iolock); + if (ret) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + if (unlikely(ioflags & IO_ISDIRECT)) + mutex_unlock(&inode->i_mutex); + return ret; + } + } + + if (unlikely(ioflags & IO_ISDIRECT)) { + if (inode->i_mapping->nrpages) + ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK), + -1, FI_REMAPF_LOCKED); + mutex_unlock(&inode->i_mutex); + if (ret) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; + } + } + + trace_xfs_file_read(ip, size, *offset, ioflags); + + iocb->ki_pos = *offset; + ret = generic_file_aio_read(iocb, iovp, segs, *offset); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; +} + +ssize_t +xfs_splice_read( + xfs_inode_t *ip, + struct file *infilp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t count, + int flags, + int ioflags) +{ + xfs_mount_t *mp = ip->i_mount; + ssize_t ret; + + XFS_STATS_INC(xs_read_calls); + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + + if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { + int iolock = XFS_IOLOCK_SHARED; + int error; + + error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count, + FILP_DELAY_FLAG(infilp), &iolock); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return -error; + } + } + + trace_xfs_file_splice_read(ip, count, *ppos, ioflags); + + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; +} + +ssize_t +xfs_splice_write( + xfs_inode_t *ip, + struct pipe_inode_info *pipe, + struct file *outfilp, + loff_t *ppos, + size_t count, + int flags, + int ioflags) +{ + xfs_mount_t *mp = ip->i_mount; + ssize_t ret; + struct inode *inode = outfilp->f_mapping->host; + xfs_fsize_t isize, new_size; + + XFS_STATS_INC(xs_write_calls); + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) { + int iolock = XFS_IOLOCK_EXCL; + int error; + + error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count, + FILP_DELAY_FLAG(outfilp), &iolock); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return -error; + } + } + + new_size = *ppos + count; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (new_size > ip->i_size) + ip->i_new_size = new_size; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + trace_xfs_file_splice_write(ip, count, *ppos, ioflags); + + ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); + if (ret > 0) + XFS_STATS_ADD(xs_write_bytes, ret); + + isize = i_size_read(inode); + if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) + *ppos = isize; + + if (*ppos > ip->i_size) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (*ppos > ip->i_size) + ip->i_size = *ppos; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + if (ip->i_new_size) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + ip->i_new_size = 0; + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return ret; +} + +/* + * This routine is called to handle zeroing any space in the last + * block of the file that is beyond the EOF. We do this since the + * size is being increased without writing anything to that block + * and we don't want anyone to read the garbage on the disk. + */ +STATIC int /* error (positive) */ +xfs_zero_last_block( + xfs_inode_t *ip, + xfs_fsize_t offset, + xfs_fsize_t isize) +{ + xfs_fileoff_t last_fsb; + xfs_mount_t *mp = ip->i_mount; + int nimaps; + int zero_offset; + int zero_len; + int error = 0; + xfs_bmbt_irec_t imap; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + zero_offset = XFS_B_FSB_OFFSET(mp, isize); + if (zero_offset == 0) { + /* + * There are no extra bytes in the last block on disk to + * zero, so return. + */ + return 0; + } + + last_fsb = XFS_B_TO_FSBT(mp, isize); + nimaps = 1; + error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, + &nimaps, NULL, NULL); + if (error) { + return error; + } + ASSERT(nimaps > 0); + /* + * If the block underlying isize is just a hole, then there + * is nothing to zero. + */ + if (imap.br_startblock == HOLESTARTBLOCK) { + return 0; + } + /* + * Zero the part of the last block beyond the EOF, and write it + * out sync. We need to drop the ilock while we do this so we + * don't deadlock when the buffer cache calls back to us. + */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + zero_len = mp->m_sb.sb_blocksize - zero_offset; + if (isize + zero_len > offset) + zero_len = offset - isize; + error = xfs_iozero(ip, isize, zero_len); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + ASSERT(error >= 0); + return error; +} + +/* + * Zero any on disk space between the current EOF and the new, + * larger EOF. This handles the normal case of zeroing the remainder + * of the last block in the file and the unusual case of zeroing blocks + * out beyond the size of the file. This second case only happens + * with fixed size extents and when the system crashes before the inode + * size was updated but after blocks were allocated. If fill is set, + * then any holes in the range are filled and zeroed. If not, the holes + * are left alone as holes. + */ + +int /* error (positive) */ +xfs_zero_eof( + xfs_inode_t *ip, + xfs_off_t offset, /* starting I/O offset */ + xfs_fsize_t isize) /* current inode size */ +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t start_zero_fsb; + xfs_fileoff_t end_zero_fsb; + xfs_fileoff_t zero_count_fsb; + xfs_fileoff_t last_fsb; + xfs_fileoff_t zero_off; + xfs_fsize_t zero_len; + int nimaps; + int error = 0; + xfs_bmbt_irec_t imap; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + ASSERT(offset > isize); + + /* + * First handle zeroing the block on which isize resides. + * We only zero a part of that block so it is handled specially. + */ + error = xfs_zero_last_block(ip, offset, isize); + if (error) { + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + return error; + } + + /* + * Calculate the range between the new size and the old + * where blocks needing to be zeroed may exist. To get the + * block where the last byte in the file currently resides, + * we need to subtract one from the size and truncate back + * to a block boundary. We subtract 1 in case the size is + * exactly on a block boundary. + */ + last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; + start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); + end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); + ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); + if (last_fsb == end_zero_fsb) { + /* + * The size was only incremented on its last block. + * We took care of that above, so just return. + */ + return 0; + } + + ASSERT(start_zero_fsb <= end_zero_fsb); + while (start_zero_fsb <= end_zero_fsb) { + nimaps = 1; + zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; + error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, + 0, NULL, 0, &imap, &nimaps, NULL, NULL); + if (error) { + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + return error; + } + ASSERT(nimaps > 0); + + if (imap.br_state == XFS_EXT_UNWRITTEN || + imap.br_startblock == HOLESTARTBLOCK) { + /* + * This loop handles initializing pages that were + * partially initialized by the code below this + * loop. It basically zeroes the part of the page + * that sits on a hole and sets the page as P_HOLE + * and calls remapf if it is a mapped file. + */ + start_zero_fsb = imap.br_startoff + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + continue; + } + + /* + * There are blocks we need to zero. + * Drop the inode lock while we're doing the I/O. + * We'll still have the iolock to protect us. + */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); + zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); + + if ((zero_off + zero_len) > offset) + zero_len = offset - zero_off; + + error = xfs_iozero(ip, zero_off, zero_len); + if (error) { + goto out_lock; + } + + start_zero_fsb = imap.br_startoff + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + } + + return 0; + +out_lock: + xfs_ilock(ip, XFS_ILOCK_EXCL); + ASSERT(error >= 0); + return error; +} + +ssize_t /* bytes written, or (-) error */ +xfs_write( + struct xfs_inode *xip, + struct kiocb *iocb, + const struct iovec *iovp, + unsigned int nsegs, + loff_t *offset, + int ioflags) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + unsigned long segs = nsegs; + xfs_mount_t *mp; + ssize_t ret = 0, error = 0; + xfs_fsize_t isize, new_size; + int iolock; + int eventsent = 0; + size_t ocount = 0, count; + loff_t pos; + int need_i_mutex; + + XFS_STATS_INC(xs_write_calls); + + error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ); + if (error) + return error; + + count = ocount; + pos = *offset; + + if (count == 0) + return 0; + + mp = xip->i_mount; + + xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + +relock: + if (ioflags & IO_ISDIRECT) { + iolock = XFS_IOLOCK_SHARED; + need_i_mutex = 0; + } else { + iolock = XFS_IOLOCK_EXCL; + need_i_mutex = 1; + mutex_lock(&inode->i_mutex); + } + + xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); + +start: + error = -generic_write_checks(file, &pos, &count, + S_ISBLK(inode->i_mode)); + if (error) { + xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + goto out_unlock_mutex; + } + + if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) && + !(ioflags & IO_INVIS) && !eventsent)) { + int dmflags = FILP_DELAY_FLAG(file); + + if (need_i_mutex) + dmflags |= DM_FLAGS_IMUX; + + xfs_iunlock(xip, XFS_ILOCK_EXCL); + error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip, + pos, count, dmflags, &iolock); + if (error) { + goto out_unlock_internal; + } + xfs_ilock(xip, XFS_ILOCK_EXCL); + eventsent = 1; + + /* + * The iolock was dropped and reacquired in XFS_SEND_DATA + * so we have to recheck the size when appending. + * We will only "goto start;" once, since having sent the + * event prevents another call to XFS_SEND_DATA, which is + * what allows the size to change in the first place. + */ + if ((file->f_flags & O_APPEND) && pos != xip->i_size) + goto start; + } + + if (ioflags & IO_ISDIRECT) { + xfs_buftarg_t *target = + XFS_IS_REALTIME_INODE(xip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + if ((pos & target->bt_smask) || (count & target->bt_smask)) { + xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + return XFS_ERROR(-EINVAL); + } + + if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) { + xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + iolock = XFS_IOLOCK_EXCL; + need_i_mutex = 1; + mutex_lock(&inode->i_mutex); + xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); + goto start; + } + } + + new_size = pos + count; + if (new_size > xip->i_size) + xip->i_new_size = new_size; + + if (likely(!(ioflags & IO_INVIS))) + file_update_time(file); + + /* + * If the offset is beyond the size of the file, we have a couple + * of things to do. First, if there is already space allocated + * we need to either create holes or zero the disk or ... + * + * If there is a page where the previous size lands, we need + * to zero it out up to the new size. + */ + + if (pos > xip->i_size) { + error = xfs_zero_eof(xip, pos, xip->i_size); + if (error) { + xfs_iunlock(xip, XFS_ILOCK_EXCL); + goto out_unlock_internal; + } + } + xfs_iunlock(xip, XFS_ILOCK_EXCL); + + /* + * If we're writing the file then make sure to clear the + * setuid and setgid bits if the process is not being run + * by root. This keeps people from modifying setuid and + * setgid binaries. + */ + error = -file_remove_suid(file); + if (unlikely(error)) + goto out_unlock_internal; + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + + if ((ioflags & IO_ISDIRECT)) { + if (mapping->nrpages) { + WARN_ON(need_i_mutex == 0); + error = xfs_flushinval_pages(xip, + (pos & PAGE_CACHE_MASK), + -1, FI_REMAPF_LOCKED); + if (error) + goto out_unlock_internal; + } + + if (need_i_mutex) { + /* demote the lock now the cached pages are gone */ + xfs_ilock_demote(xip, XFS_IOLOCK_EXCL); + mutex_unlock(&inode->i_mutex); + + iolock = XFS_IOLOCK_SHARED; + need_i_mutex = 0; + } + + trace_xfs_file_direct_write(xip, count, *offset, ioflags); + ret = generic_file_direct_write(iocb, iovp, + &segs, pos, offset, count, ocount); + + /* + * direct-io write to a hole: fall through to buffered I/O + * for completing the rest of the request. + */ + if (ret >= 0 && ret != count) { + XFS_STATS_ADD(xs_write_bytes, ret); + + pos += ret; + count -= ret; + + ioflags &= ~IO_ISDIRECT; + xfs_iunlock(xip, iolock); + goto relock; + } + } else { + int enospc = 0; + ssize_t ret2 = 0; + +write_retry: + trace_xfs_file_buffered_write(xip, count, *offset, ioflags); + ret2 = generic_file_buffered_write(iocb, iovp, segs, + pos, offset, count, ret); + /* + * if we just got an ENOSPC, flush the inode now we + * aren't holding any page locks and retry *once* + */ + if (ret2 == -ENOSPC && !enospc) { + error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE); + if (error) + goto out_unlock_internal; + enospc = 1; + goto write_retry; + } + ret = ret2; + } + + current->backing_dev_info = NULL; + + isize = i_size_read(inode); + if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) + *offset = isize; + + if (*offset > xip->i_size) { + xfs_ilock(xip, XFS_ILOCK_EXCL); + if (*offset > xip->i_size) + xip->i_size = *offset; + xfs_iunlock(xip, XFS_ILOCK_EXCL); + } + + if (ret == -ENOSPC && + DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { + xfs_iunlock(xip, iolock); + if (need_i_mutex) + mutex_unlock(&inode->i_mutex); + error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip, + DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL, + 0, 0, 0); /* Delay flag intentionally unused */ + if (need_i_mutex) + mutex_lock(&inode->i_mutex); + xfs_ilock(xip, iolock); + if (error) + goto out_unlock_internal; + goto start; + } + + error = -ret; + if (ret <= 0) + goto out_unlock_internal; + + XFS_STATS_ADD(xs_write_bytes, ret); + + /* Handle various SYNC-type writes */ + if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { + loff_t end = pos + ret - 1; + int error2; + + xfs_iunlock(xip, iolock); + if (need_i_mutex) + mutex_unlock(&inode->i_mutex); + + error2 = filemap_write_and_wait_range(mapping, pos, end); + if (!error) + error = error2; + if (need_i_mutex) + mutex_lock(&inode->i_mutex); + xfs_ilock(xip, iolock); + + error2 = xfs_fsync(xip); + if (!error) + error = error2; + } + + out_unlock_internal: + if (xip->i_new_size) { + xfs_ilock(xip, XFS_ILOCK_EXCL); + xip->i_new_size = 0; + /* + * If this was a direct or synchronous I/O that failed (such + * as ENOSPC) then part of the I/O may have been written to + * disk before the error occured. In this case the on-disk + * file size may have been adjusted beyond the in-memory file + * size and now needs to be truncated back. + */ + if (xip->i_d.di_size > xip->i_size) + xip->i_d.di_size = xip->i_size; + xfs_iunlock(xip, XFS_ILOCK_EXCL); + } + xfs_iunlock(xip, iolock); + out_unlock_mutex: + if (need_i_mutex) + mutex_unlock(&inode->i_mutex); + return -error; +} + STATIC ssize_t xfs_file_aio_read( struct kiocb *iocb, diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c deleted file mode 100644 index eac6f80d786d..000000000000 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ /dev/null @@ -1,796 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_quota.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_bmap.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_rtalloc.h" -#include "xfs_error.h" -#include "xfs_itable.h" -#include "xfs_rw.h" -#include "xfs_attr.h" -#include "xfs_inode_item.h" -#include "xfs_buf_item.h" -#include "xfs_utils.h" -#include "xfs_iomap.h" -#include "xfs_vnodeops.h" -#include "xfs_trace.h" - -#include <linux/capability.h> -#include <linux/writeback.h> - - -/* - * xfs_iozero - * - * xfs_iozero clears the specified range of buffer supplied, - * and marks all the affected blocks as valid and modified. If - * an affected block is not allocated, it will be allocated. If - * an affected block is not completely overwritten, and is not - * valid before the operation, it will be read from disk before - * being partially zeroed. - */ -STATIC int -xfs_iozero( - struct xfs_inode *ip, /* inode */ - loff_t pos, /* offset in file */ - size_t count) /* size of data to zero */ -{ - struct page *page; - struct address_space *mapping; - int status; - - mapping = VFS_I(ip)->i_mapping; - do { - unsigned offset, bytes; - void *fsdata; - - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ - bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) - bytes = count; - - status = pagecache_write_begin(NULL, mapping, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (status) - break; - - zero_user(page, offset, bytes); - - status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, - page, fsdata); - WARN_ON(status <= 0); /* can't return less than zero! */ - pos += bytes; - count -= bytes; - status = 0; - } while (count); - - return (-status); -} - -ssize_t /* bytes read, or (-) error */ -xfs_read( - xfs_inode_t *ip, - struct kiocb *iocb, - const struct iovec *iovp, - unsigned int segs, - loff_t *offset, - int ioflags) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - xfs_mount_t *mp = ip->i_mount; - size_t size = 0; - ssize_t ret = 0; - xfs_fsize_t n; - unsigned long seg; - - - XFS_STATS_INC(xs_read_calls); - - /* START copy & waste from filemap.c */ - for (seg = 0; seg < segs; seg++) { - const struct iovec *iv = &iovp[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. - */ - size += iv->iov_len; - if (unlikely((ssize_t)(size|iv->iov_len) < 0)) - return XFS_ERROR(-EINVAL); - } - /* END copy & waste from filemap.c */ - - if (unlikely(ioflags & IO_ISDIRECT)) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - if ((*offset & target->bt_smask) || - (size & target->bt_smask)) { - if (*offset == ip->i_size) { - return (0); - } - return -XFS_ERROR(EINVAL); - } - } - - n = XFS_MAXIOFFSET(mp) - *offset; - if ((n <= 0) || (size == 0)) - return 0; - - if (n < size) - size = n; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - if (unlikely(ioflags & IO_ISDIRECT)) - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - - if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { - int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); - int iolock = XFS_IOLOCK_SHARED; - - ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size, - dmflags, &iolock); - if (ret) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - if (unlikely(ioflags & IO_ISDIRECT)) - mutex_unlock(&inode->i_mutex); - return ret; - } - } - - if (unlikely(ioflags & IO_ISDIRECT)) { - if (inode->i_mapping->nrpages) - ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); - mutex_unlock(&inode->i_mutex); - if (ret) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; - } - } - - trace_xfs_file_read(ip, size, *offset, ioflags); - - iocb->ki_pos = *offset; - ret = generic_file_aio_read(iocb, iovp, segs, *offset); - if (ret > 0) - XFS_STATS_ADD(xs_read_bytes, ret); - - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; -} - -ssize_t -xfs_splice_read( - xfs_inode_t *ip, - struct file *infilp, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t count, - int flags, - int ioflags) -{ - xfs_mount_t *mp = ip->i_mount; - ssize_t ret; - - XFS_STATS_INC(xs_read_calls); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - xfs_ilock(ip, XFS_IOLOCK_SHARED); - - if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { - int iolock = XFS_IOLOCK_SHARED; - int error; - - error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count, - FILP_DELAY_FLAG(infilp), &iolock); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return -error; - } - } - - trace_xfs_file_splice_read(ip, count, *ppos, ioflags); - - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_read_bytes, ret); - - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; -} - -ssize_t -xfs_splice_write( - xfs_inode_t *ip, - struct pipe_inode_info *pipe, - struct file *outfilp, - loff_t *ppos, - size_t count, - int flags, - int ioflags) -{ - xfs_mount_t *mp = ip->i_mount; - ssize_t ret; - struct inode *inode = outfilp->f_mapping->host; - xfs_fsize_t isize, new_size; - - XFS_STATS_INC(xs_write_calls); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) { - int iolock = XFS_IOLOCK_EXCL; - int error; - - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count, - FILP_DELAY_FLAG(outfilp), &iolock); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return -error; - } - } - - new_size = *ppos + count; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (new_size > ip->i_size) - ip->i_new_size = new_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - trace_xfs_file_splice_write(ip, count, *ppos, ioflags); - - ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_write_bytes, ret); - - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) - *ppos = isize; - - if (*ppos > ip->i_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (*ppos > ip->i_size) - ip->i_size = *ppos; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - - if (ip->i_new_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return ret; -} - -/* - * This routine is called to handle zeroing any space in the last - * block of the file that is beyond the EOF. We do this since the - * size is being increased without writing anything to that block - * and we don't want anyone to read the garbage on the disk. - */ -STATIC int /* error (positive) */ -xfs_zero_last_block( - xfs_inode_t *ip, - xfs_fsize_t offset, - xfs_fsize_t isize) -{ - xfs_fileoff_t last_fsb; - xfs_mount_t *mp = ip->i_mount; - int nimaps; - int zero_offset; - int zero_len; - int error = 0; - xfs_bmbt_irec_t imap; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - zero_offset = XFS_B_FSB_OFFSET(mp, isize); - if (zero_offset == 0) { - /* - * There are no extra bytes in the last block on disk to - * zero, so return. - */ - return 0; - } - - last_fsb = XFS_B_TO_FSBT(mp, isize); - nimaps = 1; - error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, - &nimaps, NULL, NULL); - if (error) { - return error; - } - ASSERT(nimaps > 0); - /* - * If the block underlying isize is just a hole, then there - * is nothing to zero. - */ - if (imap.br_startblock == HOLESTARTBLOCK) { - return 0; - } - /* - * Zero the part of the last block beyond the EOF, and write it - * out sync. We need to drop the ilock while we do this so we - * don't deadlock when the buffer cache calls back to us. - */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - zero_len = mp->m_sb.sb_blocksize - zero_offset; - if (isize + zero_len > offset) - zero_len = offset - isize; - error = xfs_iozero(ip, isize, zero_len); - - xfs_ilock(ip, XFS_ILOCK_EXCL); - ASSERT(error >= 0); - return error; -} - -/* - * Zero any on disk space between the current EOF and the new, - * larger EOF. This handles the normal case of zeroing the remainder - * of the last block in the file and the unusual case of zeroing blocks - * out beyond the size of the file. This second case only happens - * with fixed size extents and when the system crashes before the inode - * size was updated but after blocks were allocated. If fill is set, - * then any holes in the range are filled and zeroed. If not, the holes - * are left alone as holes. - */ - -int /* error (positive) */ -xfs_zero_eof( - xfs_inode_t *ip, - xfs_off_t offset, /* starting I/O offset */ - xfs_fsize_t isize) /* current inode size */ -{ - xfs_mount_t *mp = ip->i_mount; - xfs_fileoff_t start_zero_fsb; - xfs_fileoff_t end_zero_fsb; - xfs_fileoff_t zero_count_fsb; - xfs_fileoff_t last_fsb; - xfs_fileoff_t zero_off; - xfs_fsize_t zero_len; - int nimaps; - int error = 0; - xfs_bmbt_irec_t imap; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - ASSERT(offset > isize); - - /* - * First handle zeroing the block on which isize resides. - * We only zero a part of that block so it is handled specially. - */ - error = xfs_zero_last_block(ip, offset, isize); - if (error) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - return error; - } - - /* - * Calculate the range between the new size and the old - * where blocks needing to be zeroed may exist. To get the - * block where the last byte in the file currently resides, - * we need to subtract one from the size and truncate back - * to a block boundary. We subtract 1 in case the size is - * exactly on a block boundary. - */ - last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; - start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); - end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); - ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); - if (last_fsb == end_zero_fsb) { - /* - * The size was only incremented on its last block. - * We took care of that above, so just return. - */ - return 0; - } - - ASSERT(start_zero_fsb <= end_zero_fsb); - while (start_zero_fsb <= end_zero_fsb) { - nimaps = 1; - zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; - error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, - 0, NULL, 0, &imap, &nimaps, NULL, NULL); - if (error) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - return error; - } - ASSERT(nimaps > 0); - - if (imap.br_state == XFS_EXT_UNWRITTEN || - imap.br_startblock == HOLESTARTBLOCK) { - /* - * This loop handles initializing pages that were - * partially initialized by the code below this - * loop. It basically zeroes the part of the page - * that sits on a hole and sets the page as P_HOLE - * and calls remapf if it is a mapped file. - */ - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - continue; - } - - /* - * There are blocks we need to zero. - * Drop the inode lock while we're doing the I/O. - * We'll still have the iolock to protect us. - */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); - zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); - - if ((zero_off + zero_len) > offset) - zero_len = offset - zero_off; - - error = xfs_iozero(ip, zero_off, zero_len); - if (error) { - goto out_lock; - } - - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - - xfs_ilock(ip, XFS_ILOCK_EXCL); - } - - return 0; - -out_lock: - xfs_ilock(ip, XFS_ILOCK_EXCL); - ASSERT(error >= 0); - return error; -} - -ssize_t /* bytes written, or (-) error */ -xfs_write( - struct xfs_inode *xip, - struct kiocb *iocb, - const struct iovec *iovp, - unsigned int nsegs, - loff_t *offset, - int ioflags) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - unsigned long segs = nsegs; - xfs_mount_t *mp; - ssize_t ret = 0, error = 0; - xfs_fsize_t isize, new_size; - int iolock; - int eventsent = 0; - size_t ocount = 0, count; - loff_t pos; - int need_i_mutex; - - XFS_STATS_INC(xs_write_calls); - - error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ); - if (error) - return error; - - count = ocount; - pos = *offset; - - if (count == 0) - return 0; - - mp = xip->i_mount; - - xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - -relock: - if (ioflags & IO_ISDIRECT) { - iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; - } else { - iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); - } - - xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); - -start: - error = -generic_write_checks(file, &pos, &count, - S_ISBLK(inode->i_mode)); - if (error) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); - goto out_unlock_mutex; - } - - if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) && - !(ioflags & IO_INVIS) && !eventsent)) { - int dmflags = FILP_DELAY_FLAG(file); - - if (need_i_mutex) - dmflags |= DM_FLAGS_IMUX; - - xfs_iunlock(xip, XFS_ILOCK_EXCL); - error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip, - pos, count, dmflags, &iolock); - if (error) { - goto out_unlock_internal; - } - xfs_ilock(xip, XFS_ILOCK_EXCL); - eventsent = 1; - - /* - * The iolock was dropped and reacquired in XFS_SEND_DATA - * so we have to recheck the size when appending. - * We will only "goto start;" once, since having sent the - * event prevents another call to XFS_SEND_DATA, which is - * what allows the size to change in the first place. - */ - if ((file->f_flags & O_APPEND) && pos != xip->i_size) - goto start; - } - - if (ioflags & IO_ISDIRECT) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(xip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - - if ((pos & target->bt_smask) || (count & target->bt_smask)) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); - return XFS_ERROR(-EINVAL); - } - - if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); - iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); - xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); - goto start; - } - } - - new_size = pos + count; - if (new_size > xip->i_size) - xip->i_new_size = new_size; - - if (likely(!(ioflags & IO_INVIS))) - file_update_time(file); - - /* - * If the offset is beyond the size of the file, we have a couple - * of things to do. First, if there is already space allocated - * we need to either create holes or zero the disk or ... - * - * If there is a page where the previous size lands, we need - * to zero it out up to the new size. - */ - - if (pos > xip->i_size) { - error = xfs_zero_eof(xip, pos, xip->i_size); - if (error) { - xfs_iunlock(xip, XFS_ILOCK_EXCL); - goto out_unlock_internal; - } - } - xfs_iunlock(xip, XFS_ILOCK_EXCL); - - /* - * If we're writing the file then make sure to clear the - * setuid and setgid bits if the process is not being run - * by root. This keeps people from modifying setuid and - * setgid binaries. - */ - error = -file_remove_suid(file); - if (unlikely(error)) - goto out_unlock_internal; - - /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; - - if ((ioflags & IO_ISDIRECT)) { - if (mapping->nrpages) { - WARN_ON(need_i_mutex == 0); - error = xfs_flushinval_pages(xip, - (pos & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); - if (error) - goto out_unlock_internal; - } - - if (need_i_mutex) { - /* demote the lock now the cached pages are gone */ - xfs_ilock_demote(xip, XFS_IOLOCK_EXCL); - mutex_unlock(&inode->i_mutex); - - iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; - } - - trace_xfs_file_direct_write(xip, count, *offset, ioflags); - ret = generic_file_direct_write(iocb, iovp, - &segs, pos, offset, count, ocount); - - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. - */ - if (ret >= 0 && ret != count) { - XFS_STATS_ADD(xs_write_bytes, ret); - - pos += ret; - count -= ret; - - ioflags &= ~IO_ISDIRECT; - xfs_iunlock(xip, iolock); - goto relock; - } - } else { - int enospc = 0; - ssize_t ret2 = 0; - -write_retry: - trace_xfs_file_buffered_write(xip, count, *offset, ioflags); - ret2 = generic_file_buffered_write(iocb, iovp, segs, - pos, offset, count, ret); - /* - * if we just got an ENOSPC, flush the inode now we - * aren't holding any page locks and retry *once* - */ - if (ret2 == -ENOSPC && !enospc) { - error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE); - if (error) - goto out_unlock_internal; - enospc = 1; - goto write_retry; - } - ret = ret2; - } - - current->backing_dev_info = NULL; - - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) - *offset = isize; - - if (*offset > xip->i_size) { - xfs_ilock(xip, XFS_ILOCK_EXCL); - if (*offset > xip->i_size) - xip->i_size = *offset; - xfs_iunlock(xip, XFS_ILOCK_EXCL); - } - - if (ret == -ENOSPC && - DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { - xfs_iunlock(xip, iolock); - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip, - DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL, - 0, 0, 0); /* Delay flag intentionally unused */ - if (need_i_mutex) - mutex_lock(&inode->i_mutex); - xfs_ilock(xip, iolock); - if (error) - goto out_unlock_internal; - goto start; - } - - error = -ret; - if (ret <= 0) - goto out_unlock_internal; - - XFS_STATS_ADD(xs_write_bytes, ret); - - /* Handle various SYNC-type writes */ - if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { - loff_t end = pos + ret - 1; - int error2; - - xfs_iunlock(xip, iolock); - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - - error2 = filemap_write_and_wait_range(mapping, pos, end); - if (!error) - error = error2; - if (need_i_mutex) - mutex_lock(&inode->i_mutex); - xfs_ilock(xip, iolock); - - error2 = xfs_fsync(xip); - if (!error) - error = error2; - } - - out_unlock_internal: - if (xip->i_new_size) { - xfs_ilock(xip, XFS_ILOCK_EXCL); - xip->i_new_size = 0; - /* - * If this was a direct or synchronous I/O that failed (such - * as ENOSPC) then part of the I/O may have been written to - * disk before the error occured. In this case the on-disk - * file size may have been adjusted beyond the in-memory file - * size and now needs to be truncated back. - */ - if (xip->i_d.di_size > xip->i_size) - xip->i_d.di_size = xip->i_size; - xfs_iunlock(xip, XFS_ILOCK_EXCL); - } - xfs_iunlock(xip, iolock); - out_unlock_mutex: - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - return -error; -} - -/* - * If the underlying (data/log/rt) device is readonly, there are some - * operations that cannot proceed. - */ -int -xfs_dev_is_read_only( - xfs_mount_t *mp, - char *message) -{ - if (xfs_readonly_buftarg(mp->m_ddev_targp) || - xfs_readonly_buftarg(mp->m_logdev_targp) || - (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { - cmn_err(CE_NOTE, - "XFS: %s required on read-only device.", message); - cmn_err(CE_NOTE, - "XFS: write access unavailable, cannot proceed."); - return EROFS; - } - return 0; -} diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 6afaaeb2950a..c207fef6770b 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -2052,6 +2052,26 @@ xfs_mount_log_sb( return error; } +/* + * If the underlying (data/log/rt) device is readonly, there are some + * operations that cannot proceed. + */ +int +xfs_dev_is_read_only( + struct xfs_mount *mp, + char *message) +{ + if (xfs_readonly_buftarg(mp->m_ddev_targp) || + xfs_readonly_buftarg(mp->m_logdev_targp) || + (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { + cmn_err(CE_NOTE, + "XFS: %s required on read-only device.", message); + cmn_err(CE_NOTE, + "XFS: write access unavailable, cannot proceed."); + return EROFS; + } + return 0; +} #ifdef HAVE_PERCPU_SB /* diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 774f40729ca1..ee33e11d9872 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -50,18 +50,6 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); -ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb, - const struct iovec *iovp, unsigned int segs, - loff_t *offset, int ioflags); -ssize_t xfs_splice_read(struct xfs_inode *ip, struct file *infilp, - loff_t *ppos, struct pipe_inode_info *pipe, size_t count, - int flags, int ioflags); -ssize_t xfs_splice_write(struct xfs_inode *ip, - struct pipe_inode_info *pipe, struct file *outfilp, - loff_t *ppos, size_t count, int flags, int ioflags); -ssize_t xfs_write(struct xfs_inode *xip, struct kiocb *iocb, - const struct iovec *iovp, unsigned int nsegs, - loff_t *offset, int ioflags); int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, int flags, struct xfs_iomap *iomapp, int *niomaps); void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first, -- cgit v1.2.3 From 00258e36b2d33b1b5cef7b489e06c5e0a9df58b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Mon, 15 Feb 2010 09:44:47 +0000 Subject: xfs: remove wrappers for read/write file operations Currently the aio_read, aio_write, splice_read and splice_write file operations are divided into a low-level routine doing all the work and one that implements the Linux file operations and does minimal argument wrapping. This is a leftover from the days of the vnode operations layer and can be removed to simplify the code a lot. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_file.c | 285 ++++++++++++++++++-------------------------- 1 file changed, 114 insertions(+), 171 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 51fc510828a4..1eb561a10e26 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -96,28 +96,34 @@ xfs_iozero( return (-status); } -ssize_t /* bytes read, or (-) error */ -xfs_read( - xfs_inode_t *ip, +STATIC ssize_t +xfs_file_aio_read( struct kiocb *iocb, const struct iovec *iovp, - unsigned int segs, - loff_t *offset, - int ioflags) + unsigned long nr_segs, + loff_t pos) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - xfs_mount_t *mp = ip->i_mount; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; size_t size = 0; ssize_t ret = 0; + int ioflags = 0; xfs_fsize_t n; unsigned long seg; - XFS_STATS_INC(xs_read_calls); + BUG_ON(iocb->ki_pos != pos); + + if (unlikely(file->f_flags & O_DIRECT)) + ioflags |= IO_ISDIRECT; + if (file->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + /* START copy & waste from filemap.c */ - for (seg = 0; seg < segs; seg++) { + for (seg = 0; seg < nr_segs; seg++) { const struct iovec *iv = &iovp[seg]; /* @@ -134,17 +140,16 @@ xfs_read( xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - if ((*offset & target->bt_smask) || + if ((iocb->ki_pos & target->bt_smask) || (size & target->bt_smask)) { - if (*offset == ip->i_size) { - return (0); - } + if (iocb->ki_pos == ip->i_size) + return 0; return -XFS_ERROR(EINVAL); } } - n = XFS_MAXIOFFSET(mp) - *offset; - if ((n <= 0) || (size == 0)) + n = XFS_MAXIOFFSET(mp) - iocb->ki_pos; + if (n <= 0 || size == 0) return 0; if (n < size) @@ -161,7 +166,7 @@ xfs_read( int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); int iolock = XFS_IOLOCK_SHARED; - ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size, + ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size, dmflags, &iolock); if (ret) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -172,9 +177,11 @@ xfs_read( } if (unlikely(ioflags & IO_ISDIRECT)) { - if (inode->i_mapping->nrpages) - ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); + if (inode->i_mapping->nrpages) { + ret = -xfs_flushinval_pages(ip, + (iocb->ki_pos & PAGE_CACHE_MASK), + -1, FI_REMAPF_LOCKED); + } mutex_unlock(&inode->i_mutex); if (ret) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -182,10 +189,9 @@ xfs_read( } } - trace_xfs_file_read(ip, size, *offset, ioflags); + trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); - iocb->ki_pos = *offset; - ret = generic_file_aio_read(iocb, iovp, segs, *offset); + ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -193,20 +199,24 @@ xfs_read( return ret; } -ssize_t -xfs_splice_read( - xfs_inode_t *ip, +STATIC ssize_t +xfs_file_splice_read( struct file *infilp, loff_t *ppos, struct pipe_inode_info *pipe, size_t count, - int flags, - int ioflags) + unsigned int flags) { - xfs_mount_t *mp = ip->i_mount; + struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); + struct xfs_mount *mp = ip->i_mount; + int ioflags = 0; ssize_t ret; XFS_STATS_INC(xs_read_calls); + + if (infilp->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; @@ -234,22 +244,26 @@ xfs_splice_read( return ret; } -ssize_t -xfs_splice_write( - xfs_inode_t *ip, +STATIC ssize_t +xfs_file_splice_write( struct pipe_inode_info *pipe, struct file *outfilp, loff_t *ppos, size_t count, - int flags, - int ioflags) + unsigned int flags) { - xfs_mount_t *mp = ip->i_mount; - ssize_t ret; struct inode *inode = outfilp->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; xfs_fsize_t isize, new_size; + int ioflags = 0; + ssize_t ret; XFS_STATS_INC(xs_write_calls); + + if (outfilp->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; @@ -484,42 +498,43 @@ out_lock: return error; } -ssize_t /* bytes written, or (-) error */ -xfs_write( - struct xfs_inode *xip, +STATIC ssize_t +xfs_file_aio_write( struct kiocb *iocb, const struct iovec *iovp, - unsigned int nsegs, - loff_t *offset, - int ioflags) + unsigned long nr_segs, + loff_t pos) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - unsigned long segs = nsegs; - xfs_mount_t *mp; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; ssize_t ret = 0, error = 0; + int ioflags = 0; xfs_fsize_t isize, new_size; int iolock; int eventsent = 0; size_t ocount = 0, count; - loff_t pos; int need_i_mutex; XFS_STATS_INC(xs_write_calls); - error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ); + BUG_ON(iocb->ki_pos != pos); + + if (unlikely(file->f_flags & O_DIRECT)) + ioflags |= IO_ISDIRECT; + if (file->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + + error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); if (error) return error; count = ocount; - pos = *offset; - if (count == 0) return 0; - mp = xip->i_mount; - xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); if (XFS_FORCED_SHUTDOWN(mp)) @@ -535,30 +550,30 @@ relock: mutex_lock(&inode->i_mutex); } - xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); + xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); start: error = -generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (error) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); goto out_unlock_mutex; } - if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) && + if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS) && !eventsent)) { int dmflags = FILP_DELAY_FLAG(file); if (need_i_mutex) dmflags |= DM_FLAGS_IMUX; - xfs_iunlock(xip, XFS_ILOCK_EXCL); - error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip, + xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip, pos, count, dmflags, &iolock); if (error) { goto out_unlock_internal; } - xfs_ilock(xip, XFS_ILOCK_EXCL); + xfs_ilock(ip, XFS_ILOCK_EXCL); eventsent = 1; /* @@ -568,33 +583,33 @@ start: * event prevents another call to XFS_SEND_DATA, which is * what allows the size to change in the first place. */ - if ((file->f_flags & O_APPEND) && pos != xip->i_size) + if ((file->f_flags & O_APPEND) && pos != ip->i_size) goto start; } if (ioflags & IO_ISDIRECT) { xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(xip) ? + XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; if ((pos & target->bt_smask) || (count & target->bt_smask)) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); return XFS_ERROR(-EINVAL); } - if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { + xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); iolock = XFS_IOLOCK_EXCL; need_i_mutex = 1; mutex_lock(&inode->i_mutex); - xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); + xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); goto start; } } new_size = pos + count; - if (new_size > xip->i_size) - xip->i_new_size = new_size; + if (new_size > ip->i_size) + ip->i_new_size = new_size; if (likely(!(ioflags & IO_INVIS))) file_update_time(file); @@ -608,14 +623,14 @@ start: * to zero it out up to the new size. */ - if (pos > xip->i_size) { - error = xfs_zero_eof(xip, pos, xip->i_size); + if (pos > ip->i_size) { + error = xfs_zero_eof(ip, pos, ip->i_size); if (error) { - xfs_iunlock(xip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, XFS_ILOCK_EXCL); goto out_unlock_internal; } } - xfs_iunlock(xip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, XFS_ILOCK_EXCL); /* * If we're writing the file then make sure to clear the @@ -633,7 +648,7 @@ start: if ((ioflags & IO_ISDIRECT)) { if (mapping->nrpages) { WARN_ON(need_i_mutex == 0); - error = xfs_flushinval_pages(xip, + error = xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); if (error) @@ -642,16 +657,16 @@ start: if (need_i_mutex) { /* demote the lock now the cached pages are gone */ - xfs_ilock_demote(xip, XFS_IOLOCK_EXCL); + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); mutex_unlock(&inode->i_mutex); iolock = XFS_IOLOCK_SHARED; need_i_mutex = 0; } - trace_xfs_file_direct_write(xip, count, *offset, ioflags); + trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); ret = generic_file_direct_write(iocb, iovp, - &segs, pos, offset, count, ocount); + &nr_segs, pos, &iocb->ki_pos, count, ocount); /* * direct-io write to a hole: fall through to buffered I/O @@ -664,7 +679,7 @@ start: count -= ret; ioflags &= ~IO_ISDIRECT; - xfs_iunlock(xip, iolock); + xfs_iunlock(ip, iolock); goto relock; } } else { @@ -672,15 +687,15 @@ start: ssize_t ret2 = 0; write_retry: - trace_xfs_file_buffered_write(xip, count, *offset, ioflags); - ret2 = generic_file_buffered_write(iocb, iovp, segs, - pos, offset, count, ret); + trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); + ret2 = generic_file_buffered_write(iocb, iovp, nr_segs, + pos, &iocb->ki_pos, count, ret); /* * if we just got an ENOSPC, flush the inode now we * aren't holding any page locks and retry *once* */ if (ret2 == -ENOSPC && !enospc) { - error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE); + error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE); if (error) goto out_unlock_internal; enospc = 1; @@ -692,27 +707,27 @@ write_retry: current->backing_dev_info = NULL; isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) - *offset = isize; - - if (*offset > xip->i_size) { - xfs_ilock(xip, XFS_ILOCK_EXCL); - if (*offset > xip->i_size) - xip->i_size = *offset; - xfs_iunlock(xip, XFS_ILOCK_EXCL); + if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) + iocb->ki_pos = isize; + + if (iocb->ki_pos > ip->i_size) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (iocb->ki_pos > ip->i_size) + ip->i_size = iocb->ki_pos; + xfs_iunlock(ip, XFS_ILOCK_EXCL); } if (ret == -ENOSPC && - DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { - xfs_iunlock(xip, iolock); + DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { + xfs_iunlock(ip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); - error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip, - DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL, + error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip, + DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ if (need_i_mutex) mutex_lock(&inode->i_mutex); - xfs_ilock(xip, iolock); + xfs_ilock(ip, iolock); if (error) goto out_unlock_internal; goto start; @@ -729,7 +744,7 @@ write_retry: loff_t end = pos + ret - 1; int error2; - xfs_iunlock(xip, iolock); + xfs_iunlock(ip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); @@ -738,17 +753,17 @@ write_retry: error = error2; if (need_i_mutex) mutex_lock(&inode->i_mutex); - xfs_ilock(xip, iolock); + xfs_ilock(ip, iolock); - error2 = xfs_fsync(xip); + error2 = xfs_fsync(ip); if (!error) error = error2; } out_unlock_internal: - if (xip->i_new_size) { - xfs_ilock(xip, XFS_ILOCK_EXCL); - xip->i_new_size = 0; + if (ip->i_new_size) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + ip->i_new_size = 0; /* * If this was a direct or synchronous I/O that failed (such * as ENOSPC) then part of the I/O may have been written to @@ -756,89 +771,17 @@ write_retry: * file size may have been adjusted beyond the in-memory file * size and now needs to be truncated back. */ - if (xip->i_d.di_size > xip->i_size) - xip->i_d.di_size = xip->i_size; - xfs_iunlock(xip, XFS_ILOCK_EXCL); + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; + xfs_iunlock(ip, XFS_ILOCK_EXCL); } - xfs_iunlock(xip, iolock); + xfs_iunlock(ip, iolock); out_unlock_mutex: if (need_i_mutex) mutex_unlock(&inode->i_mutex); return -error; } -STATIC ssize_t -xfs_file_aio_read( - struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos) -{ - struct file *file = iocb->ki_filp; - int ioflags = 0; - - BUG_ON(iocb->ki_pos != pos); - if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; - if (file->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; - return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov, - nr_segs, &iocb->ki_pos, ioflags); -} - -STATIC ssize_t -xfs_file_aio_write( - struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos) -{ - struct file *file = iocb->ki_filp; - int ioflags = 0; - - BUG_ON(iocb->ki_pos != pos); - if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; - if (file->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; - return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs, - &iocb->ki_pos, ioflags); -} - -STATIC ssize_t -xfs_file_splice_read( - struct file *infilp, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags) -{ - int ioflags = 0; - - if (infilp->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; - - return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), - infilp, ppos, pipe, len, flags, ioflags); -} - -STATIC ssize_t -xfs_file_splice_write( - struct pipe_inode_info *pipe, - struct file *outfilp, - loff_t *ppos, - size_t len, - unsigned int flags) -{ - int ioflags = 0; - - if (outfilp->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; - - return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), - pipe, outfilp, ppos, len, flags, ioflags); -} - STATIC int xfs_file_open( struct inode *inode, -- cgit v1.2.3 From fd3200bef7d66ed3924f72c79a465fb7ff85478a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Mon, 15 Feb 2010 09:44:48 +0000 Subject: xfs: remove wrapper for the fsync file operation Currently the fsync file operation is divided into a low-level routine doing all the work and one that implements the Linux file operation and does minimal argument wrapping. This is a leftover from the days of the vnode operations layer and can be removed to simplify the code a bit, as well as preparing for the implementation of an optimized fdatasync which needs to look at the Linux inode state. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_file.c | 140 ++++++++++++++++++++++++++++++++++++-------- fs/xfs/xfs_vnodeops.c | 107 --------------------------------- fs/xfs/xfs_vnodeops.h | 1 - 3 files changed, 117 insertions(+), 131 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 1eb561a10e26..6c283b7be8ab 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -35,6 +35,7 @@ #include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_rw.h" @@ -96,6 +97,120 @@ xfs_iozero( return (-status); } +/* + * We ignore the datasync flag here because a datasync is effectively + * identical to an fsync. That is, datasync implies that we need to write + * only the metadata needed to be able to access the data that is written + * if we crash after the call completes. Hence if we are writing beyond + * EOF we have to log the inode size change as well, which makes it a + * full fsync. If we don't write beyond EOF, the inode core will be + * clean in memory and so we don't need to log the inode, just like + * fsync. + */ +STATIC int +xfs_file_fsync( + struct file *file, + struct dentry *dentry, + int datasync) +{ + struct xfs_inode *ip = XFS_I(dentry->d_inode); + struct xfs_trans *tp; + int error = 0; + int log_flushed = 0; + + xfs_itrace_entry(ip); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -XFS_ERROR(EIO); + + xfs_iflags_clear(ip, XFS_ITRUNCATED); + + /* + * We always need to make sure that the required inode state is safe on + * disk. The inode might be clean but we still might need to force the + * log because of committed transactions that haven't hit the disk yet. + * Likewise, there could be unflushed non-transactional changes to the + * inode core that have to go to disk and this requires us to issue + * a synchronous transaction to capture these changes correctly. + * + * This code relies on the assumption that if the i_update_core field + * of the inode is clear and the inode is unpinned then it is clean + * and no action is required. + */ + xfs_ilock(ip, XFS_ILOCK_SHARED); + + if (ip->i_update_core) { + /* + * Kick off a transaction to log the inode core to get the + * updates. The sync transaction will also force the log. + */ + xfs_iunlock(ip, XFS_ILOCK_SHARED); + tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, + XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return -error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * Note - it's possible that we might have pushed ourselves out + * of the way during trans_reserve which would flush the inode. + * But there's no guarantee that the inode buffer has actually + * gone out yet (it's delwri). Plus the buffer could be pinned + * anyway if it's part of an inode in another recent + * transaction. So we play it safe and fire off the + * transaction anyway. + */ + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_set_sync(tp); + error = _xfs_trans_commit(tp, 0, &log_flushed); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } else { + /* + * Timestamps/size haven't changed since last inode flush or + * inode transaction commit. That means either nothing got + * written or a transaction committed which caught the updates. + * If the latter happened and the transaction hasn't hit the + * disk yet, the inode will be still be pinned. If it is, + * force the log. + */ + xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) { + if (ip->i_itemp->ili_last_lsn) { + error = _xfs_log_force_lsn(ip->i_mount, + ip->i_itemp->ili_last_lsn, + XFS_LOG_SYNC, &log_flushed); + } else { + error = _xfs_log_force(ip->i_mount, + XFS_LOG_SYNC, &log_flushed); + } + } + } + + if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) { + /* + * If the log write didn't issue an ordered tag we need + * to flush the disk cache for the data device now. + */ + if (!log_flushed) + xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp); + + /* + * If this inode is on the RT dev we need to flush that + * cache as well. + */ + if (XFS_IS_REALTIME_INODE(ip)) + xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp); + } + + return -error; +} + STATIC ssize_t xfs_file_aio_read( struct kiocb *iocb, @@ -755,7 +870,8 @@ write_retry: mutex_lock(&inode->i_mutex); xfs_ilock(ip, iolock); - error2 = xfs_fsync(ip); + error2 = -xfs_file_fsync(file, file->f_path.dentry, + (file->f_flags & __O_SYNC) ? 0 : 1); if (!error) error = error2; } @@ -826,28 +942,6 @@ xfs_file_release( return -xfs_release(XFS_I(inode)); } -/* - * We ignore the datasync flag here because a datasync is effectively - * identical to an fsync. That is, datasync implies that we need to write - * only the metadata needed to be able to access the data that is written - * if we crash after the call completes. Hence if we are writing beyond - * EOF we have to log the inode size change as well, which makes it a - * full fsync. If we don't write beyond EOF, the inode core will be - * clean in memory and so we don't need to log the inode, just like - * fsync. - */ -STATIC int -xfs_file_fsync( - struct file *file, - struct dentry *dentry, - int datasync) -{ - struct xfs_inode *ip = XFS_I(dentry->d_inode); - - xfs_iflags_clear(ip, XFS_ITRUNCATED); - return -xfs_fsync(ip); -} - STATIC int xfs_file_readdir( struct file *filp, diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index ddd2c5d1b854..9d376be0ea38 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -583,113 +583,6 @@ xfs_readlink( return error; } -/* - * xfs_fsync - * - * This is called to sync the inode and its data out to disk. We need to hold - * the I/O lock while flushing the data, and the inode lock while flushing the - * inode. The inode lock CANNOT be held while flushing the data, so acquire - * after we're done with that. - */ -int -xfs_fsync( - xfs_inode_t *ip) -{ - xfs_trans_t *tp; - int error = 0; - int log_flushed = 0; - - xfs_itrace_entry(ip); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return XFS_ERROR(EIO); - - /* - * We always need to make sure that the required inode state is safe on - * disk. The inode might be clean but we still might need to force the - * log because of committed transactions that haven't hit the disk yet. - * Likewise, there could be unflushed non-transactional changes to the - * inode core that have to go to disk and this requires us to issue - * a synchronous transaction to capture these changes correctly. - * - * This code relies on the assumption that if the update_* fields - * of the inode are clear and the inode is unpinned then it is clean - * and no action is required. - */ - xfs_ilock(ip, XFS_ILOCK_SHARED); - - if (!ip->i_update_core) { - /* - * Timestamps/size haven't changed since last inode flush or - * inode transaction commit. That means either nothing got - * written or a transaction committed which caught the updates. - * If the latter happened and the transaction hasn't hit the - * disk yet, the inode will be still be pinned. If it is, - * force the log. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) { - if (ip->i_itemp->ili_last_lsn) { - error = _xfs_log_force_lsn(ip->i_mount, - ip->i_itemp->ili_last_lsn, - XFS_LOG_SYNC, &log_flushed); - } else { - error = _xfs_log_force(ip->i_mount, - XFS_LOG_SYNC, &log_flushed); - } - } - } else { - /* - * Kick off a transaction to log the inode core to get the - * updates. The sync transaction will also force the log. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED); - tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, - XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - - /* - * Note - it's possible that we might have pushed ourselves out - * of the way during trans_reserve which would flush the inode. - * But there's no guarantee that the inode buffer has actually - * gone out yet (it's delwri). Plus the buffer could be pinned - * anyway if it's part of an inode in another recent - * transaction. So we play it safe and fire off the - * transaction anyway. - */ - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - error = _xfs_trans_commit(tp, 0, &log_flushed); - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - - if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) { - /* - * If the log write didn't issue an ordered tag we need - * to flush the disk cache for the data device now. - */ - if (!log_flushed) - xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp); - - /* - * If this inode is on the RT dev we need to flush that - * cache as well. - */ - if (XFS_IS_REALTIME_INODE(ip)) - xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp); - } - - return error; -} - /* * Flags for xfs_free_eofblocks */ diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index ee33e11d9872..36f3858736f6 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -21,7 +21,6 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags); #define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ int xfs_readlink(struct xfs_inode *ip, char *link); -int xfs_fsync(struct xfs_inode *ip); int xfs_release(struct xfs_inode *ip); int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, -- cgit v1.2.3 From 66d834ea603d61bd90fedad90300ca91c5bba0a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Mon, 15 Feb 2010 09:44:49 +0000 Subject: xfs: implement optimized fdatasync Allow us to track the difference between timestamp and size updates by using mark_inode_dirty from the I/O completion code, and checking the VFS inode flags in xfs_file_fsync. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_aops.c | 4 ++-- fs/xfs/linux-2.6/xfs_file.c | 23 ++++++++++++----------- fs/xfs/linux-2.6/xfs_iops.c | 10 ++++++++++ fs/xfs/xfs_inode.h | 1 + 4 files changed, 25 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 66abe36c1213..ce369a816ce3 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -187,7 +187,7 @@ xfs_setfilesize( isize = xfs_ioend_new_eof(ioend); if (isize) { ip->i_d.di_size = isize; - xfs_mark_inode_dirty_sync(ip); + xfs_mark_inode_dirty(ip); } xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -341,7 +341,7 @@ xfs_submit_ioend_bio( * but don't update the inode size until I/O completion. */ if (xfs_ioend_new_eof(ioend)) - xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); + xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE, bio); diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 6c283b7be8ab..43f9554adaac 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -97,16 +97,6 @@ xfs_iozero( return (-status); } -/* - * We ignore the datasync flag here because a datasync is effectively - * identical to an fsync. That is, datasync implies that we need to write - * only the metadata needed to be able to access the data that is written - * if we crash after the call completes. Hence if we are writing beyond - * EOF we have to log the inode size change as well, which makes it a - * full fsync. If we don't write beyond EOF, the inode core will be - * clean in memory and so we don't need to log the inode, just like - * fsync. - */ STATIC int xfs_file_fsync( struct file *file, @@ -139,7 +129,18 @@ xfs_file_fsync( */ xfs_ilock(ip, XFS_ILOCK_SHARED); - if (ip->i_update_core) { + /* + * First check if the VFS inode is marked dirty. All the dirtying + * of non-transactional updates no goes through mark_inode_dirty*, + * which allows us to distinguish beteeen pure timestamp updates + * and i_size updates which need to be caught for fdatasync. + * After that also theck for the dirty state in the XFS inode, which + * might gets cleared when the inode gets written out via the AIL + * or xfs_iflush_cluster. + */ + if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) || + ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) && + ip->i_update_core) { /* * Kick off a transaction to log the inode core to get the * updates. The sync transaction will also force the log. diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index e8566bbf0f00..61a99608731e 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -91,6 +91,16 @@ xfs_mark_inode_dirty_sync( mark_inode_dirty_sync(inode); } +void +xfs_mark_inode_dirty( + xfs_inode_t *ip) +{ + struct inode *inode = VFS_I(ip); + + if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR))) + mark_inode_dirty(inode); +} + /* * Change the requested timestamp in the given inode. * We don't lock across timestamp updates, and we don't log them but diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 6c912b027596..41e8a4e2e3be 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -480,6 +480,7 @@ void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); void xfs_synchronize_times(xfs_inode_t *); +void xfs_mark_inode_dirty(xfs_inode_t *); void xfs_mark_inode_dirty_sync(xfs_inode_t *); #define IHOLD(ip) \ -- cgit v1.2.3 From 77d7a0c2eeb285c9069e15396703d0cb9690ac50 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Wed, 17 Feb 2010 05:36:29 +0000 Subject: xfs: Non-blocking inode locking in IO completion The introduction of barriers to loop devices has created a new IO order completion dependency that XFS does not handle. The loop device implements barriers using fsync and so turns a log IO in the XFS filesystem on the loop device into a data IO in the backing filesystem. That is, the completion of log IOs in the loop filesystem are now dependent on completion of data IO in the backing filesystem. This can cause deadlocks when a flush daemon issues a log force with an inode locked because the IO completion of IO on the inode is blocked by the inode lock. This in turn prevents further data IO completion from occuring on all XFS filesystems on that CPU (due to the shared nature of the completion queues). This then prevents the log IO from completing because the log is waiting for data IO completion as well. The fix for this new completion order dependency issue is to make the IO completion inode locking non-blocking. If the inode lock can't be grabbed, simply requeue the IO completion back to the work queue so that it can be processed later. This prevents the completion queue from being blocked and allows data IO completion on other inodes to proceed, hence avoiding completion order dependent deadlocks. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_aops.c | 93 +++++++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index ce369a816ce3..b493c63976cd 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -163,14 +163,17 @@ xfs_ioend_new_eof( } /* - * Update on-disk file size now that data has been written to disk. - * The current in-memory file size is i_size. If a write is beyond - * eof i_new_size will be the intended file size until i_size is - * updated. If this write does not extend all the way to the valid - * file size then restrict this update to the end of the write. + * Update on-disk file size now that data has been written to disk. The + * current in-memory file size is i_size. If a write is beyond eof i_new_size + * will be the intended file size until i_size is updated. If this write does + * not extend all the way to the valid file size then restrict this update to + * the end of the write. + * + * This function does not block as blocking on the inode lock in IO completion + * can lead to IO completion order dependency deadlocks.. If it can't get the + * inode ilock it will return EAGAIN. Callers must handle this. */ - -STATIC void +STATIC int xfs_setfilesize( xfs_ioend_t *ioend) { @@ -181,9 +184,11 @@ xfs_setfilesize( ASSERT(ioend->io_type != IOMAP_READ); if (unlikely(ioend->io_error)) - return; + return 0; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + return EAGAIN; - xfs_ilock(ip, XFS_ILOCK_EXCL); isize = xfs_ioend_new_eof(ioend); if (isize) { ip->i_d.di_size = isize; @@ -191,6 +196,28 @@ xfs_setfilesize( } xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + +/* + * Schedule IO completion handling on a xfsdatad if this was + * the final hold on this ioend. If we are asked to wait, + * flush the workqueue. + */ +STATIC void +xfs_finish_ioend( + xfs_ioend_t *ioend, + int wait) +{ + if (atomic_dec_and_test(&ioend->io_remaining)) { + struct workqueue_struct *wq; + + wq = (ioend->io_type == IOMAP_UNWRITTEN) ? + xfsconvertd_workqueue : xfsdatad_workqueue; + queue_work(wq, &ioend->io_work); + if (wait) + flush_workqueue(wq); + } } /* @@ -198,11 +225,11 @@ xfs_setfilesize( */ STATIC void xfs_end_io( - struct work_struct *work) + struct work_struct *work) { - xfs_ioend_t *ioend = - container_of(work, xfs_ioend_t, io_work); - struct xfs_inode *ip = XFS_I(ioend->io_inode); + xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); + struct xfs_inode *ip = XFS_I(ioend->io_inode); + int error; /* * For unwritten extents we need to issue transactions to convert a @@ -210,7 +237,6 @@ xfs_end_io( */ if (ioend->io_type == IOMAP_UNWRITTEN && likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { - int error; error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); @@ -222,30 +248,23 @@ xfs_end_io( * We might have to update the on-disk file size after extending * writes. */ - if (ioend->io_type != IOMAP_READ) - xfs_setfilesize(ioend); - xfs_destroy_ioend(ioend); -} - -/* - * Schedule IO completion handling on a xfsdatad if this was - * the final hold on this ioend. If we are asked to wait, - * flush the workqueue. - */ -STATIC void -xfs_finish_ioend( - xfs_ioend_t *ioend, - int wait) -{ - if (atomic_dec_and_test(&ioend->io_remaining)) { - struct workqueue_struct *wq; - - wq = (ioend->io_type == IOMAP_UNWRITTEN) ? - xfsconvertd_workqueue : xfsdatad_workqueue; - queue_work(wq, &ioend->io_work); - if (wait) - flush_workqueue(wq); + if (ioend->io_type != IOMAP_READ) { + error = xfs_setfilesize(ioend); + ASSERT(!error || error == EAGAIN); } + + /* + * If we didn't complete processing of the ioend, requeue it to the + * tail of the workqueue for another attempt later. Otherwise destroy + * it. + */ + if (error == EAGAIN) { + atomic_inc(&ioend->io_remaining); + xfs_finish_ioend(ioend, 0); + /* ensure we don't spin on blocked ioends */ + delay(1); + } else + xfs_destroy_ioend(ioend); } /* -- cgit v1.2.3 From 024910cbac323ab2e5ad6d7fa7958799b04b9728 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 17 Feb 2010 19:34:57 +0000 Subject: xfs: fix inode pincount check in fsync We need to hold the ilock to check the inode pincount safely. While we're at it also remove the check for ip->i_itemp->ili_last_lsn, a pinned inode always has it set. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_file.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 43f9554adaac..42dd3bcfba6b 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -180,17 +180,12 @@ xfs_file_fsync( * disk yet, the inode will be still be pinned. If it is, * force the log. */ - xfs_iunlock(ip, XFS_ILOCK_SHARED); if (xfs_ipincount(ip)) { - if (ip->i_itemp->ili_last_lsn) { - error = _xfs_log_force_lsn(ip->i_mount, - ip->i_itemp->ili_last_lsn, - XFS_LOG_SYNC, &log_flushed); - } else { - error = _xfs_log_force(ip->i_mount, - XFS_LOG_SYNC, &log_flushed); - } + error = _xfs_log_force_lsn(ip->i_mount, + ip->i_itemp->ili_last_lsn, + XFS_LOG_SYNC, &log_flushed); } + xfs_iunlock(ip, XFS_ILOCK_SHARED); } if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) { -- cgit v1.2.3 From f7008d0aeba21396b3422df135b692ae701bd0c8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Mon, 15 Feb 2010 22:02:19 +0000 Subject: xfs: fix xfs_fsblock_t tracing Using a static buffer in xfs_fmtfsblock means we can corrupt traces if multiple CPUs hit this code path at the same. Just remove xfs_fmtfsblock for now and print the block number purely numerical. If we want the NULLFSBLOCK and NULLSTARTBLOCK formatting back the best way would be a decoding plugin in the trace-cmd userspace command. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_trace.c | 16 ---------------- fs/xfs/linux-2.6/xfs_trace.h | 12 ++++++------ 2 files changed, 6 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c index 856eb3c8d605..5a107601e969 100644 --- a/fs/xfs/linux-2.6/xfs_trace.c +++ b/fs/xfs/linux-2.6/xfs_trace.c @@ -51,22 +51,6 @@ #include "quota/xfs_dquot_item.h" #include "quota/xfs_dquot.h" -/* - * Format fsblock number into a static buffer & return it. - */ -STATIC char *xfs_fmtfsblock(xfs_fsblock_t bno) -{ - static char rval[50]; - - if (bno == NULLFSBLOCK) - sprintf(rval, "NULLFSBLOCK"); - else if (isnullstartblock(bno)) - sprintf(rval, "NULLSTARTBLOCK(%lld)", startblockval(bno)); - else - sprintf(rval, "%lld", (xfs_dfsbno_t)bno); - return rval; -} - /* * We include this last to have the helpers above available for the trace * event implementations. diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index 87f9fad80aac..fcaa62f0799e 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -197,13 +197,13 @@ TRACE_EVENT(xfs_iext_insert, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " - "offset %lld block %s count %lld flag %d caller %pf", + "offset %lld block %lld count %lld flag %d caller %pf", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), (long)__entry->idx, __entry->startoff, - xfs_fmtfsblock(__entry->startblock), + (__int64_t)__entry->startblock, __entry->blockcount, __entry->state, (char *)__entry->caller_ip) @@ -241,13 +241,13 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " - "offset %lld block %s count %lld flag %d caller %pf", + "offset %lld block %lld count %lld flag %d caller %pf", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), (long)__entry->idx, __entry->startoff, - xfs_fmtfsblock(__entry->startblock), + (__int64_t)__entry->startblock, __entry->blockcount, __entry->state, (char *)__entry->caller_ip) @@ -881,7 +881,7 @@ TRACE_EVENT(name, \ ), \ TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ "offset 0x%llx count %zd flags %s " \ - "startoff 0x%llx startblock %s blockcount 0x%llx", \ + "startoff 0x%llx startblock %lld blockcount 0x%llx", \ MAJOR(__entry->dev), MINOR(__entry->dev), \ __entry->ino, \ __entry->size, \ @@ -890,7 +890,7 @@ TRACE_EVENT(name, \ __entry->count, \ __print_flags(__entry->flags, "|", BMAPI_FLAGS), \ __entry->startoff, \ - xfs_fmtfsblock(__entry->startblock), \ + (__int64_t)__entry->startblock, \ __entry->blockcount) \ ) DEFINE_IOMAP_EVENT(xfs_iomap_enter); -- cgit v1.2.3 From c467c049e7e6e8b8fe9f38b13b20a4cf7e03f36b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Mon, 15 Feb 2010 23:34:42 +0000 Subject: xfs: split xfs_bmap_btalloc Split out the nullfb case into a separate function to reduce the stack footprint and make the code more readable. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/xfs_bmap.c | 220 +++++++++++++++++++++++++++++------------------------- 1 file changed, 119 insertions(+), 101 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 1869fb973819..5c11e4d17010 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2549,6 +2549,121 @@ xfs_bmap_rtalloc( return 0; } +STATIC int +xfs_bmap_btalloc_nullfb( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen) +{ + struct xfs_mount *mp = ap->ip->i_mount; + struct xfs_perag *pag; + xfs_agnumber_t ag, startag; + int notinit = 0; + int error; + + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) + args->type = XFS_ALLOCTYPE_NEAR_BNO; + else + args->type = XFS_ALLOCTYPE_START_BNO; + args->total = ap->total; + + /* + * Search for an allocation group with a single extent large enough + * for the request. If one isn't found, then adjust the minimum + * allocation size to the largest space found. + */ + startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno); + if (startag == NULLAGNUMBER) + startag = ag = 0; + + pag = xfs_perag_get(mp, ag); + while (*blen < ap->alen) { + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, args->tp, ag, + XFS_ALLOC_FLAG_TRYLOCK); + if (error) { + xfs_perag_put(pag); + return error; + } + } + + /* + * See xfs_alloc_fix_freelist... + */ + if (pag->pagf_init) { + xfs_extlen_t longest; + longest = xfs_alloc_longest_free_extent(mp, pag); + if (*blen < longest) + *blen = longest; + } else + notinit = 1; + + if (xfs_inode_is_filestream(ap->ip)) { + if (*blen >= ap->alen) + break; + + if (ap->userdata) { + /* + * If startag is an invalid AG, we've + * come here once before and + * xfs_filestream_new_ag picked the + * best currently available. + * + * Don't continue looping, since we + * could loop forever. + */ + if (startag == NULLAGNUMBER) + break; + + error = xfs_filestream_new_ag(ap, &ag); + xfs_perag_put(pag); + if (error) + return error; + + /* loop again to set 'blen'*/ + startag = NULLAGNUMBER; + pag = xfs_perag_get(mp, ag); + continue; + } + } + if (++ag == mp->m_sb.sb_agcount) + ag = 0; + if (ag == startag) + break; + xfs_perag_put(pag); + pag = xfs_perag_get(mp, ag); + } + xfs_perag_put(pag); + + /* + * Since the above loop did a BUF_TRYLOCK, it is + * possible that there is space for this request. + */ + if (notinit || *blen < ap->minlen) + args->minlen = ap->minlen; + /* + * If the best seen length is less than the request + * length, use the best as the minimum. + */ + else if (*blen < ap->alen) + args->minlen = *blen; + /* + * Otherwise we've seen an extent as big as alen, + * use that as the minimum. + */ + else + args->minlen = ap->alen; + + /* + * set the failure fallback case to look in the selected + * AG as the stream may have moved. + */ + if (xfs_inode_is_filestream(ap->ip)) + ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); + + return 0; +} + STATIC int xfs_bmap_btalloc( xfs_bmalloca_t *ap) /* bmap alloc argument struct */ @@ -2556,16 +2671,13 @@ xfs_bmap_btalloc( xfs_mount_t *mp; /* mount point structure */ xfs_alloctype_t atype = 0; /* type for allocation routines */ xfs_extlen_t align; /* minimum allocation alignment */ - xfs_agnumber_t ag; xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ - xfs_agnumber_t startag; + xfs_agnumber_t ag; xfs_alloc_arg_t args; xfs_extlen_t blen; xfs_extlen_t nextminlen = 0; - xfs_perag_t *pag; int nullfb; /* true if ap->firstblock isn't set */ int isaligned; - int notinit; int tryagain; int error; @@ -2612,103 +2724,9 @@ xfs_bmap_btalloc( args.firstblock = ap->firstblock; blen = 0; if (nullfb) { - if (ap->userdata && xfs_inode_is_filestream(ap->ip)) - args.type = XFS_ALLOCTYPE_NEAR_BNO; - else - args.type = XFS_ALLOCTYPE_START_BNO; - args.total = ap->total; - - /* - * Search for an allocation group with a single extent - * large enough for the request. - * - * If one isn't found, then adjust the minimum allocation - * size to the largest space found. - */ - startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno); - if (startag == NULLAGNUMBER) - startag = ag = 0; - notinit = 0; - pag = xfs_perag_get(mp, ag); - while (blen < ap->alen) { - if (!pag->pagf_init && - (error = xfs_alloc_pagf_init(mp, args.tp, - ag, XFS_ALLOC_FLAG_TRYLOCK))) { - xfs_perag_put(pag); - return error; - } - /* - * See xfs_alloc_fix_freelist... - */ - if (pag->pagf_init) { - xfs_extlen_t longest; - longest = xfs_alloc_longest_free_extent(mp, pag); - if (blen < longest) - blen = longest; - } else - notinit = 1; - - if (xfs_inode_is_filestream(ap->ip)) { - if (blen >= ap->alen) - break; - - if (ap->userdata) { - /* - * If startag is an invalid AG, we've - * come here once before and - * xfs_filestream_new_ag picked the - * best currently available. - * - * Don't continue looping, since we - * could loop forever. - */ - if (startag == NULLAGNUMBER) - break; - - error = xfs_filestream_new_ag(ap, &ag); - xfs_perag_put(pag); - if (error) - return error; - - /* loop again to set 'blen'*/ - startag = NULLAGNUMBER; - pag = xfs_perag_get(mp, ag); - continue; - } - } - if (++ag == mp->m_sb.sb_agcount) - ag = 0; - if (ag == startag) - break; - xfs_perag_put(pag); - pag = xfs_perag_get(mp, ag); - } - xfs_perag_put(pag); - /* - * Since the above loop did a BUF_TRYLOCK, it is - * possible that there is space for this request. - */ - if (notinit || blen < ap->minlen) - args.minlen = ap->minlen; - /* - * If the best seen length is less than the request - * length, use the best as the minimum. - */ - else if (blen < ap->alen) - args.minlen = blen; - /* - * Otherwise we've seen an extent as big as alen, - * use that as the minimum. - */ - else - args.minlen = ap->alen; - - /* - * set the failure fallback case to look in the selected - * AG as the stream may have moved. - */ - if (xfs_inode_is_filestream(ap->ip)) - ap->rval = args.fsbno = XFS_AGB_TO_FSB(mp, ag, 0); + error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); + if (error) + return error; } else if (ap->low) { if (xfs_inode_is_filestream(ap->ip)) args.type = XFS_ALLOCTYPE_FIRST_AG; -- cgit v1.2.3 From 35a8a72f064105807b091a21e004893b219c9ed2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Mon, 15 Feb 2010 23:34:54 +0000 Subject: xfs: stop passing opaque handles to xfs_log.c routines Currenly we pass opaque xfs_log_ticket_t handles instead of struct xlog_ticket pointers, and void pointers instead of struct xlog_in_core pointers to various log manager functions. Instead pass properly typed pointers after adding forward declarations for them to xfs_log.h, and adjust the touched function prototypes to the standard XFS style while at it. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/xfs_log.c | 106 ++++++++++++++++++++++++++--------------------------- fs/xfs/xfs_log.h | 16 ++++---- fs/xfs/xfs_trans.c | 2 +- fs/xfs/xfs_trans.h | 2 +- 4 files changed, 62 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 4f16be4b6ee5..e8fba92d7cd9 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -60,7 +60,7 @@ STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); STATIC void xlog_dealloc_log(xlog_t *log); STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[], - int nentries, xfs_log_ticket_t tic, + int nentries, struct xlog_ticket *tic, xfs_lsn_t *start_lsn, xlog_in_core_t **commit_iclog, uint flags); @@ -243,14 +243,14 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) * out when the next write occurs. */ xfs_lsn_t -xfs_log_done(xfs_mount_t *mp, - xfs_log_ticket_t xtic, - void **iclog, - uint flags) +xfs_log_done( + struct xfs_mount *mp, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + uint flags) { - xlog_t *log = mp->m_log; - xlog_ticket_t *ticket = (xfs_log_ticket_t) xtic; - xfs_lsn_t lsn = 0; + struct log *log = mp->m_log; + xfs_lsn_t lsn = 0; if (XLOG_FORCED_SHUTDOWN(log) || /* @@ -258,8 +258,7 @@ xfs_log_done(xfs_mount_t *mp, * If we get an error, just continue and give back the log ticket. */ (((ticket->t_flags & XLOG_TIC_INITED) == 0) && - (xlog_commit_record(mp, ticket, - (xlog_in_core_t **)iclog, &lsn)))) { + (xlog_commit_record(mp, ticket, iclog, &lsn)))) { lsn = (xfs_lsn_t) -1; if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { flags |= XFS_LOG_REL_PERM_RESERV; @@ -289,7 +288,7 @@ xfs_log_done(xfs_mount_t *mp, } return lsn; -} /* xfs_log_done */ +} /* * Attaches a new iclog I/O completion callback routine during @@ -298,11 +297,11 @@ xfs_log_done(xfs_mount_t *mp, * executing the callback at an appropriate time. */ int -xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ - void *iclog_hndl, /* iclog to hang callback off */ - xfs_log_callback_t *cb) +xfs_log_notify( + struct xfs_mount *mp, + struct xlog_in_core *iclog, + xfs_log_callback_t *cb) { - xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl; int abortflg; spin_lock(&iclog->ic_callback_lock); @@ -316,16 +315,14 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ } spin_unlock(&iclog->ic_callback_lock); return abortflg; -} /* xfs_log_notify */ +} int -xfs_log_release_iclog(xfs_mount_t *mp, - void *iclog_hndl) +xfs_log_release_iclog( + struct xfs_mount *mp, + struct xlog_in_core *iclog) { - xlog_t *log = mp->m_log; - xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl; - - if (xlog_state_release_iclog(log, iclog)) { + if (xlog_state_release_iclog(mp->m_log, iclog)) { xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); return EIO; } @@ -344,17 +341,18 @@ xfs_log_release_iclog(xfs_mount_t *mp, * reservation, we prevent over allocation problems. */ int -xfs_log_reserve(xfs_mount_t *mp, - int unit_bytes, - int cnt, - xfs_log_ticket_t *ticket, - __uint8_t client, - uint flags, - uint t_type) +xfs_log_reserve( + struct xfs_mount *mp, + int unit_bytes, + int cnt, + struct xlog_ticket **ticket, + __uint8_t client, + uint flags, + uint t_type) { - xlog_t *log = mp->m_log; - xlog_ticket_t *internal_ticket; - int retval = 0; + struct log *log = mp->m_log; + struct xlog_ticket *internal_ticket; + int retval = 0; ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); ASSERT((flags & XFS_LOG_NOSLEEP) == 0); @@ -367,7 +365,7 @@ xfs_log_reserve(xfs_mount_t *mp, if (*ticket != NULL) { ASSERT(flags & XFS_LOG_PERM_RESERV); - internal_ticket = (xlog_ticket_t *)*ticket; + internal_ticket = *ticket; trace_xfs_log_reserve(log, internal_ticket); @@ -519,7 +517,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) xlog_in_core_t *first_iclog; #endif xfs_log_iovec_t reg[1]; - xfs_log_ticket_t tic = NULL; + xlog_ticket_t *tic = NULL; xfs_lsn_t lsn; int error; @@ -656,24 +654,24 @@ xfs_log_unmount(xfs_mount_t *mp) * transaction occur with one call to xfs_log_write(). */ int -xfs_log_write(xfs_mount_t * mp, - xfs_log_iovec_t reg[], - int nentries, - xfs_log_ticket_t tic, - xfs_lsn_t *start_lsn) +xfs_log_write( + struct xfs_mount *mp, + struct xfs_log_iovec reg[], + int nentries, + struct xlog_ticket *tic, + xfs_lsn_t *start_lsn) { - int error; - xlog_t *log = mp->m_log; + struct log *log = mp->m_log; + int error; if (XLOG_FORCED_SHUTDOWN(log)) return XFS_ERROR(EIO); - if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) { + error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0); + if (error) xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - } return error; -} /* xfs_log_write */ - +} void xfs_log_move_tail(xfs_mount_t *mp, @@ -1642,16 +1640,16 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) * bytes have been written out. */ STATIC int -xlog_write(xfs_mount_t * mp, - xfs_log_iovec_t reg[], - int nentries, - xfs_log_ticket_t tic, - xfs_lsn_t *start_lsn, - xlog_in_core_t **commit_iclog, - uint flags) +xlog_write( + struct xfs_mount *mp, + struct xfs_log_iovec reg[], + int nentries, + struct xlog_ticket *ticket, + xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, + uint flags) { xlog_t *log = mp->m_log; - xlog_ticket_t *ticket = (xlog_ticket_t *)tic; xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ xlog_op_header_t *logop_head; /* ptr to log operation header */ __psint_t ptr; /* copy address into data region */ @@ -1765,7 +1763,7 @@ xlog_write(xfs_mount_t * mp, default: xfs_fs_cmn_err(CE_WARN, mp, "Bad XFS transaction clientid 0x%x in ticket 0x%p", - logop_head->oh_clientid, tic); + logop_head->oh_clientid, ticket); return XFS_ERROR(EIO); } diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 7074be9d13e9..97a24c7795a4 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -110,8 +110,6 @@ typedef struct xfs_log_iovec { uint i_type; /* type of region */ } xfs_log_iovec_t; -typedef void* xfs_log_ticket_t; - /* * Structure used to pass callback function and the function's argument * to the log manager. @@ -126,10 +124,12 @@ typedef struct xfs_log_callback { #ifdef __KERNEL__ /* Log manager interfaces */ struct xfs_mount; +struct xlog_in_core; struct xlog_ticket; + xfs_lsn_t xfs_log_done(struct xfs_mount *mp, - xfs_log_ticket_t ticket, - void **iclog, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, uint flags); int _xfs_log_force(struct xfs_mount *mp, uint flags, @@ -151,21 +151,21 @@ int xfs_log_mount_finish(struct xfs_mount *mp); void xfs_log_move_tail(struct xfs_mount *mp, xfs_lsn_t tail_lsn); int xfs_log_notify(struct xfs_mount *mp, - void *iclog, + struct xlog_in_core *iclog, xfs_log_callback_t *callback_entry); int xfs_log_release_iclog(struct xfs_mount *mp, - void *iclog_hndl); + struct xlog_in_core *iclog); int xfs_log_reserve(struct xfs_mount *mp, int length, int count, - xfs_log_ticket_t *ticket, + struct xlog_ticket **ticket, __uint8_t clientid, uint flags, uint t_type); int xfs_log_write(struct xfs_mount *mp, xfs_log_iovec_t region[], int nentries, - xfs_log_ticket_t ticket, + struct xlog_ticket *ticket, xfs_lsn_t *start_lsn); int xfs_log_unmount_write(struct xfs_mount *mp); void xfs_log_unmount(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index be942d4e3324..f73e358bae8d 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -796,7 +796,7 @@ _xfs_trans_commit( int sync; #define XFS_TRANS_LOGVEC_COUNT 16 xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; - void *commit_iclog; + struct xlog_in_core *commit_iclog; int shutdown; commit_lsn = -1; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index c93e3a102857..79c8bab9dfff 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -910,7 +910,7 @@ typedef struct xfs_trans { unsigned int t_blk_res_used; /* # of resvd blocks used */ unsigned int t_rtx_res; /* # of rt extents resvd */ unsigned int t_rtx_res_used; /* # of resvd rt extents used */ - xfs_log_ticket_t t_ticket; /* log mgr ticket */ + struct xlog_ticket *t_ticket; /* log mgr ticket */ xfs_lsn_t t_lsn; /* log seq num of start of * transaction. */ xfs_lsn_t t_commit_lsn; /* log seq num of end of -- cgit v1.2.3 From d7e84f413726876c0ec66bbf90770f69841f7663 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Mon, 15 Feb 2010 23:35:09 +0000 Subject: xfs: factor common xfs_trans_bjoin code Most of xfs_trans_bjoin is duplicated in xfs_trans_get_buf, xfs_trans_getsb and xfs_trans_read_buf. Add a new _xfs_trans_bjoin which can be called by all four functions. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/xfs_trans_buf.c | 216 +++++++++++++++---------------------------------- 1 file changed, 66 insertions(+), 150 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 5ffd544434eb..fb586360d1c9 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -46,6 +46,65 @@ STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *, STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *, xfs_daddr_t, int); +/* + * Add the locked buffer to the transaction. + * + * The buffer must be locked, and it cannot be associated with any + * transaction. + * + * If the buffer does not yet have a buf log item associated with it, + * then allocate one for it. Then add the buf item to the transaction. + */ +STATIC void +_xfs_trans_bjoin( + struct xfs_trans *tp, + struct xfs_buf *bp, + int reset_recur) +{ + struct xfs_buf_log_item *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL); + + /* + * The xfs_buf_log_item pointer is stored in b_fsprivate. If + * it doesn't have one yet, then allocate one and initialize it. + * The checks to see if one is there are in xfs_buf_item_init(). + */ + xfs_buf_item_init(bp, tp->t_mountp); + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + if (reset_recur) + bip->bli_recur = 0; + + /* + * Take a reference for this transaction on the buf item. + */ + atomic_inc(&bip->bli_refcount); + + /* + * Get a log_item_desc to point at the new item. + */ + (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip); + + /* + * Initialize b_fsprivate2 so we can find it with incore_match() + * in xfs_trans_get_buf() and friends above. + */ + XFS_BUF_SET_FSPRIVATE2(bp, tp); + +} + +void +xfs_trans_bjoin( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + _xfs_trans_bjoin(tp, bp, 0); + trace_xfs_trans_bjoin(bp->b_fspriv); +} /* * Get and lock the buffer for the caller if it is not already @@ -132,40 +191,8 @@ xfs_trans_get_buf(xfs_trans_t *tp, ASSERT(!XFS_BUF_GETERROR(bp)); - /* - * The xfs_buf_log_item pointer is stored in b_fsprivate. If - * it doesn't have one yet, then allocate one and initialize it. - * The checks to see if one is there are in xfs_buf_item_init(). - */ - xfs_buf_item_init(bp, tp->t_mountp); - - /* - * Set the recursion count for the buffer within this transaction - * to 0. - */ - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - bip->bli_recur = 0; - - /* - * Take a reference for this transaction on the buf item. - */ - atomic_inc(&bip->bli_refcount); - - /* - * Get a log_item_desc to point at the new item. - */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip); - - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * above. - */ - XFS_BUF_SET_FSPRIVATE2(bp, tp); - - trace_xfs_trans_get_buf(bip); + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_get_buf(bp->b_fspriv); return (bp); } @@ -210,44 +237,11 @@ xfs_trans_getsb(xfs_trans_t *tp, } bp = xfs_getsb(mp, flags); - if (bp == NULL) { + if (bp == NULL) return NULL; - } - - /* - * The xfs_buf_log_item pointer is stored in b_fsprivate. If - * it doesn't have one yet, then allocate one and initialize it. - * The checks to see if one is there are in xfs_buf_item_init(). - */ - xfs_buf_item_init(bp, mp); - - /* - * Set the recursion count for the buffer within this transaction - * to 0. - */ - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - bip->bli_recur = 0; - - /* - * Take a reference for this transaction on the buf item. - */ - atomic_inc(&bip->bli_refcount); - - /* - * Get a log_item_desc to point at the new item. - */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip); - - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * above. - */ - XFS_BUF_SET_FSPRIVATE2(bp, tp); - trace_xfs_trans_getsb(bip); + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_getsb(bp->b_fspriv); return (bp); } @@ -425,40 +419,9 @@ xfs_trans_read_buf( if (XFS_FORCED_SHUTDOWN(mp)) goto shutdown_abort; - /* - * The xfs_buf_log_item pointer is stored in b_fsprivate. If - * it doesn't have one yet, then allocate one and initialize it. - * The checks to see if one is there are in xfs_buf_item_init(). - */ - xfs_buf_item_init(bp, tp->t_mountp); + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_read_buf(bp->b_fspriv); - /* - * Set the recursion count for the buffer within this transaction - * to 0. - */ - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - bip->bli_recur = 0; - - /* - * Take a reference for this transaction on the buf item. - */ - atomic_inc(&bip->bli_refcount); - - /* - * Get a log_item_desc to point at the new item. - */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip); - - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * above. - */ - XFS_BUF_SET_FSPRIVATE2(bp, tp); - - trace_xfs_trans_read_buf(bip); *bpp = bp; return 0; @@ -622,53 +585,6 @@ xfs_trans_brelse(xfs_trans_t *tp, return; } -/* - * Add the locked buffer to the transaction. - * The buffer must be locked, and it cannot be associated with any - * transaction. - * - * If the buffer does not yet have a buf log item associated with it, - * then allocate one for it. Then add the buf item to the transaction. - */ -void -xfs_trans_bjoin(xfs_trans_t *tp, - xfs_buf_t *bp) -{ - xfs_buf_log_item_t *bip; - - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL); - - /* - * The xfs_buf_log_item pointer is stored in b_fsprivate. If - * it doesn't have one yet, then allocate one and initialize it. - * The checks to see if one is there are in xfs_buf_item_init(). - */ - xfs_buf_item_init(bp, tp->t_mountp); - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - - /* - * Take a reference for this transaction on the buf item. - */ - atomic_inc(&bip->bli_refcount); - - /* - * Get a log_item_desc to point at the new item. - */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip); - - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * in xfs_trans_get_buf() and friends above. - */ - XFS_BUF_SET_FSPRIVATE2(bp, tp); - - trace_xfs_trans_bjoin(bip); -} - /* * Mark the buffer as not needing to be unlocked when the buf item's * IOP_UNLOCK() routine is called. The buffer must already be locked -- cgit v1.2.3 From d7658d487f4c62983015ba6c99f02dedb3ea97c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 17 Feb 2010 19:36:13 +0000 Subject: xfs: kill xfs_lrw.h Move the two declarations to better fitting headers now that xfs_lrw.c is gone. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_linux.h | 1 - fs/xfs/linux-2.6/xfs_lrw.h | 29 ----------------------------- fs/xfs/xfs_mount.h | 2 ++ fs/xfs/xfs_vnodeops.h | 2 ++ 4 files changed, 4 insertions(+), 30 deletions(-) delete mode 100644 fs/xfs/linux-2.6/xfs_lrw.h (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 5af0c81ca1ae..facfb323a706 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -88,7 +88,6 @@ #include <xfs_super.h> #include <xfs_globals.h> #include <xfs_fs_subr.h> -#include <xfs_lrw.h> #include <xfs_buf.h> /* diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h deleted file mode 100644 index 342ae8c0d011..000000000000 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_LRW_H__ -#define __XFS_LRW_H__ - -struct xfs_mount; -struct xfs_inode; -struct xfs_buf; - -extern int xfs_dev_is_read_only(struct xfs_mount *, char *); - -extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); - -#endif /* __XFS_LRW_H__ */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 70504fcf14cd..8fc2c91d1877 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -436,6 +436,8 @@ extern void xfs_freesb(xfs_mount_t *); extern int xfs_fs_writable(xfs_mount_t *); extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); +extern int xfs_dev_is_read_only(struct xfs_mount *, char *); + extern int xfs_dmops_get(struct xfs_mount *); extern void xfs_dmops_put(struct xfs_mount *); diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 36f3858736f6..d8dfa8d0dadd 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -59,4 +59,6 @@ int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last, uint64_t flags, int fiopt); int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last); +int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); + #endif /* _XFS_VNODEOPS_H */ -- cgit v1.2.3 From 60ec678371183f6e0d487e55fe8adbb33816518e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 17 Feb 2010 19:43:56 +0000 Subject: xfs: cleanup xfs_iunpin_wait/xfs_iunpin_nowait Remove the inode item pointer and ili_last_lsn checks in __xfs_iunpin_wait as any pinned inode is guaranteed to have them valid. After this the xfs_iunpin_nowait case is nothing more than a xfs_log_force_lsn, as we know that the caller has already checked the pincount. Make xfs_iunpin_nowait the new low-level routine just doing the log force and rewrite xfs_iunpin_wait around it. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/xfs_inode.c | 40 ++++++++++++---------------------------- 1 file changed, 12 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index fa31360046d4..16c3968c1bbe 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2467,47 +2467,31 @@ xfs_iunpin( } /* - * This is called to unpin an inode. It can be directed to wait or to return - * immediately without waiting for the inode to be unpinned. The caller must - * have the inode locked in at least shared mode so that the buffer cannot be - * subsequently pinned once someone is waiting for it to be unpinned. + * This is called to unpin an inode. The caller must have the inode locked + * in at least shared mode so that the buffer cannot be subsequently pinned + * once someone is waiting for it to be unpinned. */ -STATIC void -__xfs_iunpin_wait( - xfs_inode_t *ip, - int wait) +static void +xfs_iunpin_nowait( + struct xfs_inode *ip) { - xfs_inode_log_item_t *iip = ip->i_itemp; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - if (atomic_read(&ip->i_pincount) == 0) - return; /* Give the log a push to start the unpinning I/O */ - if (iip && iip->ili_last_lsn) - xfs_log_force_lsn(ip->i_mount, iip->ili_last_lsn, 0); - else - xfs_log_force(ip->i_mount, 0); + xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); - if (wait) - wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); } void xfs_iunpin_wait( - xfs_inode_t *ip) + struct xfs_inode *ip) { - __xfs_iunpin_wait(ip, 1); -} - -static inline void -xfs_iunpin_nowait( - xfs_inode_t *ip) -{ - __xfs_iunpin_wait(ip, 0); + if (xfs_ipincount(ip)) { + xfs_iunpin_nowait(ip); + wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0)); + } } - /* * xfs_iextents_copy() * -- cgit v1.2.3 From a14a5ab58f9d783ec3a2a287320fab22e1764813 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Thu, 18 Feb 2010 12:43:22 +0000 Subject: xfs: remove xfs_ipin/xfs_iunpin Inodes are only pinned/unpinned via the inode item methods, and lots of code relies on that fact. So remove the separate xfs_ipin/xfs_iunpin helpers and merge them into their only callers. This also fixes up various duplicate and/or incorrect comments. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/xfs_inode.c | 28 ---------------------------- fs/xfs/xfs_inode.h | 2 -- fs/xfs/xfs_inode_item.c | 18 +++++++++++------- 3 files changed, 11 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 16c3968c1bbe..0ffd56447045 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2438,34 +2438,6 @@ xfs_idestroy_fork( } } -/* - * Increment the pin count of the given buffer. - * This value is protected by ipinlock spinlock in the mount structure. - */ -void -xfs_ipin( - xfs_inode_t *ip) -{ - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - atomic_inc(&ip->i_pincount); -} - -/* - * Decrement the pin count of the given inode, and wake up - * anyone in xfs_iwait_unpin() if the count goes to 0. The - * inode must have been previously pinned with a call to xfs_ipin(). - */ -void -xfs_iunpin( - xfs_inode_t *ip) -{ - ASSERT(atomic_read(&ip->i_pincount) > 0); - - if (atomic_dec_and_test(&ip->i_pincount)) - wake_up(&ip->i_ipin_wait); -} - /* * This is called to unpin an inode. The caller must have the inode locked * in at least shared mode so that the buffer cannot be subsequently pinned diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 41e8a4e2e3be..9965e40a4615 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -471,8 +471,6 @@ int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *, int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); -void xfs_ipin(xfs_inode_t *); -void xfs_iunpin(xfs_inode_t *); void xfs_iunpin_wait(xfs_inode_t *); int xfs_iflush(xfs_inode_t *, uint); void xfs_ichgtime(xfs_inode_t *, int); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index d4dc063111f8..7bfea8540159 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -535,23 +535,23 @@ xfs_inode_item_format( /* * This is called to pin the inode associated with the inode log - * item in memory so it cannot be written out. Do this by calling - * xfs_ipin() to bump the pin count in the inode while holding the - * inode pin lock. + * item in memory so it cannot be written out. */ STATIC void xfs_inode_item_pin( xfs_inode_log_item_t *iip) { ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); - xfs_ipin(iip->ili_inode); + + atomic_inc(&iip->ili_inode->i_pincount); } /* * This is called to unpin the inode associated with the inode log * item which was previously pinned with a call to xfs_inode_item_pin(). - * Just call xfs_iunpin() on the inode to do this. + * + * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0. */ /* ARGSUSED */ STATIC void @@ -559,7 +559,11 @@ xfs_inode_item_unpin( xfs_inode_log_item_t *iip, int stale) { - xfs_iunpin(iip->ili_inode); + struct xfs_inode *ip = iip->ili_inode; + + ASSERT(atomic_read(&ip->i_pincount) > 0); + if (atomic_dec_and_test(&ip->i_pincount)) + wake_up(&ip->i_ipin_wait); } /* ARGSUSED */ @@ -568,7 +572,7 @@ xfs_inode_item_unpin_remove( xfs_inode_log_item_t *iip, xfs_trans_t *tp) { - xfs_iunpin(iip->ili_inode); + xfs_inode_item_unpin(iip, 0); } /* -- cgit v1.2.3 From cc483f102c3f703e853c96f95a654f0106fb2603 Mon Sep 17 00:00:00 2001 From: Tao Ma <tao.ma@oracle.com> Date: Mon, 1 Mar 2010 19:06:35 -0500 Subject: ext4: Fix fencepost error in chosing choosing group vs file preallocation. The ext4 multiblock allocator decides whether to use group or file preallocation based on the file size. When the file size reaches s_mb_stream_request (default is 16 blocks), it changes to use a file-specific preallocation. This is cool, but it has a tiny problem. See a simple script: mkfs.ext4 -b 1024 /dev/sda8 1000000 mount -t ext4 -o nodelalloc /dev/sda8 /mnt/ext4 for((i=0;i<5;i++)) do cat /mnt/4096>>/mnt/ext4/a #4096 is a file with 4096 characters. cat /mnt/4096>>/mnt/ext4/b done debuge4fs -R 'stat a' /dev/sda8|grep BLOCKS -A 1 And you get BLOCKS: (0-14):8705-8719, (15):2356, (16-19):8465-8468 So there are 3 extents, a bit strange for the lonely 15th logical block. As we write to the 16 blocks, we choose file preallocation in ext4_mb_group_or_file, but in ext4_mb_normalize_request, we meet with the 16*1024 range, so no preallocation will be carried. file b then reserves the space after '2356', so when when write 16, we start from another part. This patch just change the check in ext4_mb_group_or_file, so that for the lonely 15 we will still use group preallocation. After the patch, we will get: debuge4fs -R 'stat a' /dev/sda8|grep BLOCKS -A 1 BLOCKS: (0-15):8705-8720, (16-19):8465-8468 Looks more sane. Thanks. Signed-off-by: Tao Ma <tao.ma@oracle.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 415e11f1e9ee..72d5ceb18523 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3935,7 +3935,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) /* don't use group allocation for large files */ size = max(size, isize); - if (size >= sbi->s_mb_stream_request) { + if (size > sbi->s_mb_stream_request) { ac->ac_flags |= EXT4_MB_STREAM_ALLOC; return; } -- cgit v1.2.3 From f1f724e4b523d444c5a598d74505aefa3d6844d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Mon, 1 Mar 2010 11:30:31 +0000 Subject: xfs: fix locking for inode cache radix tree tag updates The radix-tree code requires it's users to serialize tag updates against other updates to the tree. While XFS protects tag updates against each other it does not serialize them against updates of the tree contents, which can lead to tag corruption. Fix the inode cache to always take pag_ici_lock in exclusive mode when updating radix tree tags. Signed-off-by: Christoph Hellwig <hch@lst.de> Reported-by: Patrick Schreurs <patrick@news-service.com> Tested-by: Patrick Schreurs <patrick@news-service.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_sync.c | 4 ++-- fs/xfs/xfs_iget.c | 19 +++++++++++++------ 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index a9f6d20aff41..f9fc154d9f72 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -688,12 +688,12 @@ xfs_inode_set_reclaim_tag( struct xfs_perag *pag; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - read_lock(&pag->pag_ici_lock); + write_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); __xfs_inode_set_reclaim_tag(pag, ip); __xfs_iflags_set(ip, XFS_IRECLAIMABLE); spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); + write_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); } diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index e281eb4a1c49..6845db90818f 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -190,13 +190,12 @@ xfs_iget_cache_hit( trace_xfs_iget_reclaim(ip); /* - * We need to set XFS_INEW atomically with clearing the - * reclaimable tag so that we do have an indicator of the - * inode still being initialized. + * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode + * from stomping over us while we recycle the inode. We can't + * clear the radix tree reclaimable tag yet as it requires + * pag_ici_lock to be held exclusive. */ - ip->i_flags |= XFS_INEW; - ip->i_flags &= ~XFS_IRECLAIMABLE; - __xfs_inode_clear_reclaim_tag(mp, pag, ip); + ip->i_flags |= XFS_IRECLAIM; spin_unlock(&ip->i_flags_lock); read_unlock(&pag->pag_ici_lock); @@ -216,7 +215,15 @@ xfs_iget_cache_hit( trace_xfs_iget_reclaim(ip); goto out_error; } + + write_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); + ip->i_flags |= XFS_INEW; + __xfs_inode_clear_reclaim_tag(mp, pag, ip); inode->i_state = I_NEW; + spin_unlock(&ip->i_flags_lock); + write_unlock(&pag->pag_ici_lock); } else { /* If the VFS inode is being torn down, pause and try again. */ if (!igrab(inode)) { -- cgit v1.2.3 From 437ca0fda3b442dff9e591581b5e1ffdfec24660 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov <dmonakhov@openvz.org> Date: Mon, 1 Mar 2010 22:29:21 -0500 Subject: ext4: deprecate obsoleted mount options Declare following list of mount options as deprecated: - bsddf, miniddf - grpid, bsdgroups, nogrpid, sysvgroups Declare following list of default mount options as deprecated: - bsdgroups Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/super.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 79937e921988..c832508d5515 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1205,6 +1205,8 @@ static ext4_fsblk_t get_sb_block(void **data) } #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) +static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" + "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; #ifdef CONFIG_QUOTA static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) @@ -1294,16 +1296,23 @@ static int parse_options(char *options, struct super_block *sb, token = match_token(p, tokens, args); switch (token) { case Opt_bsd_df: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); clear_opt(sbi->s_mount_opt, MINIX_DF); break; case Opt_minix_df: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); set_opt(sbi->s_mount_opt, MINIX_DF); + break; case Opt_grpid: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); set_opt(sbi->s_mount_opt, GRPID); + break; case Opt_nogrpid: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); clear_opt(sbi->s_mount_opt, GRPID); + break; case Opt_resuid: if (match_int(&args[0], &option)) @@ -2449,8 +2458,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) def_mount_opts = le32_to_cpu(es->s_default_mount_opts); if (def_mount_opts & EXT4_DEFM_DEBUG) set_opt(sbi->s_mount_opt, DEBUG); - if (def_mount_opts & EXT4_DEFM_BSDGROUPS) + if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { + ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", + "2.6.38"); set_opt(sbi->s_mount_opt, GRPID); + } if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sbi->s_mount_opt, NO_UID32); #ifdef CONFIG_EXT4_FS_XATTR -- cgit v1.2.3 From f39490bcd1691d65dc33689222a12e1fc13dd824 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov <dmonakhov@openvz.org> Date: Mon, 1 Mar 2010 23:14:36 -0500 Subject: ext4: fix error handling in migrate Set i_nlink to zero for temporary inode from very beginning. otherwise we may fail to start new journal handle and this inode will be unreferenced but with i_nlink == 1 Since we hold inode reference it can not be pruned. Also add missed journal_start retval check. Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/migrate.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 46a4101e0aec..8b87bd0eac95 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -503,14 +503,10 @@ int ext4_ext_migrate(struct inode *inode) } i_size_write(tmp_inode, i_size_read(inode)); /* - * We don't want the inode to be reclaimed - * if we got interrupted in between. We have - * this tmp inode carrying reference to the - * data blocks of the original file. We set - * the i_nlink to zero at the last stage after - * switching the original file to extent format + * Set the i_nlink to zero so it will be deleted later + * when we drop inode reference. */ - tmp_inode->i_nlink = 1; + tmp_inode->i_nlink = 0; ext4_ext_tree_init(handle, tmp_inode); ext4_orphan_add(handle, tmp_inode); @@ -537,6 +533,16 @@ int ext4_ext_migrate(struct inode *inode) up_read((&EXT4_I(inode)->i_data_sem)); handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + /* + * It is impossible to update on-disk structures without + * a handle, so just rollback in-core changes and live other + * work to orphan_list_cleanup() + */ + ext4_orphan_del(NULL, tmp_inode); + retval = PTR_ERR(handle); + goto out; + } ei = EXT4_I(inode); i_data = ei->i_data; @@ -618,15 +624,8 @@ err_out: /* Reset the extent details */ ext4_ext_tree_init(handle, tmp_inode); - - /* - * Set the i_nlink to zero so that - * generic_drop_inode really deletes the - * inode - */ - tmp_inode->i_nlink = 0; - ext4_journal_stop(handle); +out: unlock_new_inode(tmp_inode); iput(tmp_inode); -- cgit v1.2.3 From da1dafca84413145f5ac59998b4cdd06fb89f721 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov <dmonakhov@openvz.org> Date: Mon, 1 Mar 2010 23:15:02 -0500 Subject: ext4: explicitly remove inode from orphan list after failed direct io Otherwise non-empty orphan list will be triggered on umount. Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index edb7edc99f71..427f4690ad6d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3438,6 +3438,9 @@ retry: * but cannot extend i_size. Bail out and pretend * the write failed... */ ret = PTR_ERR(handle); + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + goto out; } if (inode->i_nlink) -- cgit v1.2.3 From 6e3617e579e070d3655a93ee9ed7149113e795e0 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov <dmonakhov@openvz.org> Date: Mon, 1 Mar 2010 23:29:39 -0500 Subject: ext4: Handle non empty on-disk orphan link In case of truncate errors we explicitly remove inode from in-core orphan list via orphan_del(NULL, inode) without modifying the on-disk list. But later on, the same inode may be inserted in the orphan list again which will result the on-disk linked list getting corrupted. If inode i_dtime contains valid value, then skip on-disk list modification. Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/namei.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a13948f8242f..608d21f873ec 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2013,6 +2013,13 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out_unlock; + /* + * Due to previous errors inode may be already a part of on-disk + * orphan list. If so skip on-disk list modification. + */ + if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <= + (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) + goto mem_insert; /* Insert this inode at the head of the on-disk orphan list... */ NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); @@ -2030,6 +2037,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) * * This is safe: on error we're going to ignore the orphan list * anyway on the next recovery. */ +mem_insert: if (!err) list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); -- cgit v1.2.3 From b8b8afe236e97b6359d46d3a3f8c46455e192271 Mon Sep 17 00:00:00 2001 From: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com> Date: Tue, 2 Mar 2010 00:21:35 -0500 Subject: ext4: make "offset" consistent in ext4_check_dir_entry() The callers of ext4_check_dir_entry() usually pass in the "file offset" (ext4_readdir, htree_dirblock_to_tree, search_dirblock, ext4_dx_find_entry, empty_dir), but a few callers (add_dirent_to_buf, ext4_delete_entry) only pass in the buffer offset. To accomodate those last two (which would be hard to fix otherwise), this patch changes ext4_check_dir_entry() to print the physical block number and the relative offset as well as the passed-in offset. Signed-off-by: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/dir.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 0e0bef3ba91e..29857ddd9e26 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -84,9 +84,11 @@ int ext4_check_dir_entry(const char *function, struct inode *dir, if (error_msg != NULL) __ext4_error(dir->i_sb, function, - "bad entry in directory #%lu: %s - " - "offset=%u, inode=%u, rec_len=%d, name_len=%d", - dir->i_ino, error_msg, offset, + "bad entry in directory #%lu: %s - block=%llu" + "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", + dir->i_ino, error_msg, + (unsigned long long) bh->b_blocknr, + (unsigned) (offset%bh->b_size), offset, le32_to_cpu(de->inode), rlen, de->name_len); return error_msg == NULL ? 1 : 0; -- cgit v1.2.3 From c7064ef13b2181a489836349f9baf87df0dab28f Mon Sep 17 00:00:00 2001 From: Jiaying Zhang <jiayingz@google.com> Date: Tue, 2 Mar 2010 13:28:44 -0500 Subject: ext4: mechanical rename some of the direct I/O get_block's identifiers This commit renames some of the direct I/O's block allocation flags, variables, and functions introduced in Mingming's "Direct IO for holes and fallocate" patches so that they can be used by ext4's buffered write path as well. Also changed the related function comments accordingly to cover both direct write and buffered write cases. Signed-off-by: Jiaying Zhang <jiayingz@google.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/ext4.h | 18 ++++++------ fs/ext4/extents.c | 24 ++++++++-------- fs/ext4/fsync.c | 2 +- fs/ext4/inode.c | 84 +++++++++++++++++++++++++------------------------------ fs/ext4/super.c | 2 +- 5 files changed, 61 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 74664ca19e22..c831a580bd76 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -133,7 +133,7 @@ struct mpage_da_data { int pages_written; int retval; }; -#define DIO_AIO_UNWRITTEN 0x1 +#define EXT4_IO_UNWRITTEN 0x1 typedef struct ext4_io_end { struct list_head list; /* per-file finished AIO list */ struct inode *inode; /* file being written to */ @@ -355,13 +355,13 @@ struct ext4_new_group_data { /* caller is from the direct IO path, request to creation of an unitialized extents if not allocated, split the uninitialized extent if blocks has been preallocated already*/ -#define EXT4_GET_BLOCKS_DIO 0x0008 +#define EXT4_GET_BLOCKS_PRE_IO 0x0008 #define EXT4_GET_BLOCKS_CONVERT 0x0010 -#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\ +#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) - /* Convert extent to initialized after direct IO complete */ -#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ - EXT4_GET_BLOCKS_DIO_CREATE_EXT) + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ + EXT4_GET_BLOCKS_IO_CREATE_EXT) /* * Flags used by ext4_free_blocks @@ -700,8 +700,8 @@ struct ext4_inode_info { qsize_t i_reserved_quota; #endif - /* completed async DIOs that might need unwritten extents handling */ - struct list_head i_aio_dio_complete_list; + /* completed IOs that might need unwritten extents handling */ + struct list_head i_completed_io_list; /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; @@ -1461,7 +1461,7 @@ extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); -extern int flush_aio_dio_completed_IO(struct inode *inode); +extern int flush_completed_IO(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); /* ioctl.c */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a2c21aa09e2b..90ba8d9df697 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1619,7 +1619,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, BUG_ON(path[depth].p_hdr == NULL); /* try to insert block into found extent and return */ - if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) + if (ex && (flag != EXT4_GET_BLOCKS_PRE_IO) && ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", ext4_ext_is_uninitialized(newext), @@ -1740,7 +1740,7 @@ has_space: merge: /* try to merge extents to the right */ - if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) + if (flag != EXT4_GET_BLOCKS_PRE_IO) ext4_ext_try_to_merge(inode, path, nearex); /* try to merge extents to the left */ @@ -2984,7 +2984,7 @@ fix_extent_len: ext4_ext_dirty(handle, inode, path + depth); return err; } -static int ext4_convert_unwritten_extents_dio(handle_t *handle, +static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { @@ -3064,8 +3064,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, flags, allocated); ext4_ext_show_leaf(inode, path); - /* DIO get_block() before submit the IO, split the extent */ - if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { + /* get_block() before submit the IO, split the extent */ + if (flags == EXT4_GET_BLOCKS_PRE_IO) { ret = ext4_split_unwritten_extents(handle, inode, path, iblock, max_blocks, flags); @@ -3075,14 +3075,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * completed */ if (io) - io->flag = DIO_AIO_UNWRITTEN; + io->flag = EXT4_IO_UNWRITTEN; else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); goto out; } - /* async DIO end_io complete, convert the filled extent to written */ - if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { - ret = ext4_convert_unwritten_extents_dio(handle, inode, + /* IO end_io complete, convert the filled extent to written */ + if (flags == EXT4_GET_BLOCKS_CONVERT) { + ret = ext4_convert_unwritten_extents_endio(handle, inode, path); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); @@ -3359,9 +3359,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * For non asycn direct IO case, flag the inode state * that we need to perform convertion when IO is done. */ - if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { + if (flags == EXT4_GET_BLOCKS_PRE_IO) { if (io) - io->flag = DIO_AIO_UNWRITTEN; + io->flag = EXT4_IO_UNWRITTEN; else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); @@ -3656,7 +3656,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, map_bh.b_state = 0; ret = ext4_get_blocks(handle, inode, block, max_blocks, &map_bh, - EXT4_GET_BLOCKS_DIO_CONVERT_EXT); + EXT4_GET_BLOCKS_IO_CONVERT_EXT); if (ret <= 0) { WARN_ON(ret <= 0); printk(KERN_ERR "%s: ext4_ext_get_blocks " diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 98bd140aad01..0d0c3239c1cd 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -63,7 +63,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) if (inode->i_sb->s_flags & MS_RDONLY) return 0; - ret = flush_aio_dio_completed_IO(inode); + ret = flush_completed_IO(inode); if (ret < 0) return ret; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 427f4690ad6d..28f116bdc405 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3468,7 +3468,7 @@ out: return ret; } -static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, +static int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { handle_t *handle = NULL; @@ -3476,28 +3476,14 @@ static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; int dio_credits; - ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", + ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", inode->i_ino, create); /* - * DIO VFS code passes create = 0 flag for write to - * the middle of file. It does this to avoid block - * allocation for holes, to prevent expose stale data - * out when there is parallel buffered read (which does - * not hold the i_mutex lock) while direct IO write has - * not completed. DIO request on holes finally falls back - * to buffered IO for this reason. - * - * For ext4 extent based file, since we support fallocate, - * new allocated extent as uninitialized, for holes, we - * could fallocate blocks for holes, thus parallel - * buffered IO read will zero out the page when read on - * a hole while parallel DIO write to the hole has not completed. - * - * when we come here, we know it's a direct IO write to - * to the middle of file (<i_size) - * so it's safe to override the create flag from VFS. + * ext4_get_block in prepare for a DIO write or buffer write. + * We allocate an uinitialized extent if blocks haven't been allocated. + * The extent will be converted to initialized after IO complete. */ - create = EXT4_GET_BLOCKS_DIO_CREATE_EXT; + create = EXT4_GET_BLOCKS_IO_CREATE_EXT; if (max_blocks > DIO_MAX_BLOCKS) max_blocks = DIO_MAX_BLOCKS; @@ -3524,19 +3510,20 @@ static void ext4_free_io_end(ext4_io_end_t *io) iput(io->inode); kfree(io); } -static void dump_aio_dio_list(struct inode * inode) + +static void dump_completed_IO(struct inode * inode) { #ifdef EXT4_DEBUG struct list_head *cur, *before, *after; ext4_io_end_t *io, *io0, *io1; - if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ - ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); + if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ + ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); return; } - ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); - list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ + ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); + list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ cur = &io->list; before = cur->prev; io0 = container_of(before, ext4_io_end_t, list); @@ -3552,21 +3539,21 @@ static void dump_aio_dio_list(struct inode * inode) /* * check a range of space and convert unwritten extents to written. */ -static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) +static int ext4_end_io_nolock(ext4_io_end_t *io) { struct inode *inode = io->inode; loff_t offset = io->offset; ssize_t size = io->size; int ret = 0; - ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," + ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); if (list_empty(&io->list)) return ret; - if (io->flag != DIO_AIO_UNWRITTEN) + if (io->flag != EXT4_IO_UNWRITTEN) return ret; if (offset + size <= i_size_read(inode)) @@ -3584,17 +3571,18 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) io->flag = 0; return ret; } + /* * work on completed aio dio IO, to convert unwritten extents to extents */ -static void ext4_end_aio_dio_work(struct work_struct *work) +static void ext4_end_io_work(struct work_struct *work) { ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); struct inode *inode = io->inode; int ret = 0; mutex_lock(&inode->i_mutex); - ret = ext4_end_aio_dio_nolock(io); + ret = ext4_end_io_nolock(io); if (ret >= 0) { if (!list_empty(&io->list)) list_del_init(&io->list); @@ -3602,32 +3590,35 @@ static void ext4_end_aio_dio_work(struct work_struct *work) } mutex_unlock(&inode->i_mutex); } + /* * This function is called from ext4_sync_file(). * - * When AIO DIO IO is completed, the work to convert unwritten - * extents to written is queued on workqueue but may not get immediately + * When IO is completed, the work to convert unwritten extents to + * written is queued on workqueue but may not get immediately * scheduled. When fsync is called, we need to ensure the * conversion is complete before fsync returns. - * The inode keeps track of a list of completed AIO from DIO path - * that might needs to do the conversion. This function walks through - * the list and convert the related unwritten extents to written. + * The inode keeps track of a list of pending/completed IO that + * might needs to do the conversion. This function walks through + * the list and convert the related unwritten extents for completed IO + * to written. + * The function return the number of pending IOs on success. */ -int flush_aio_dio_completed_IO(struct inode *inode) +int flush_completed_IO(struct inode *inode) { ext4_io_end_t *io; int ret = 0; int ret2 = 0; - if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) + if (list_empty(&EXT4_I(inode)->i_completed_io_list)) return ret; - dump_aio_dio_list(inode); - while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ - io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, + dump_completed_IO(inode); + while (!list_empty(&EXT4_I(inode)->i_completed_io_list)){ + io = list_entry(EXT4_I(inode)->i_completed_io_list.next, ext4_io_end_t, list); /* - * Calling ext4_end_aio_dio_nolock() to convert completed + * Calling ext4_end_io_nolock() to convert completed * IO to written. * * When ext4_sync_file() is called, run_queue() may already @@ -3640,7 +3631,7 @@ int flush_aio_dio_completed_IO(struct inode *inode) * avoid double converting from both fsync and background work * queue work. */ - ret = ext4_end_aio_dio_nolock(io); + ret = ext4_end_io_nolock(io); if (ret < 0) ret2 = ret; else @@ -3662,7 +3653,7 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode) io->offset = 0; io->size = 0; io->error = 0; - INIT_WORK(&io->work, ext4_end_aio_dio_work); + INIT_WORK(&io->work, ext4_end_io_work); INIT_LIST_HEAD(&io->list); } @@ -3685,7 +3676,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, size); /* if not aio dio with unwritten extents, just free io and return */ - if (io_end->flag != DIO_AIO_UNWRITTEN){ + if (io_end->flag != EXT4_IO_UNWRITTEN){ ext4_free_io_end(io_end); iocb->private = NULL; return; @@ -3700,9 +3691,10 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, /* Add the io_end to per-inode completed aio dio list*/ list_add_tail(&io_end->list, - &EXT4_I(io_end->inode)->i_aio_dio_complete_list); + &EXT4_I(io_end->inode)->i_completed_io_list); iocb->private = NULL; } + /* * For ext4 extent files, ext4 will do direct-io write to holes, * preallocated extents, and those write extend the file, no need to @@ -3772,7 +3764,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, - ext4_get_block_dio_write, + ext4_get_block_write, ext4_end_io_dio); if (iocb->private) EXT4_I(inode)->cur_aio_dio = NULL; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c832508d5515..dc7a97e79e3b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -708,7 +708,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif - INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); + INIT_LIST_HEAD(&ei->i_completed_io_list); ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; ei->i_datasync_tid = 0; -- cgit v1.2.3 From 744692dc059845b2a3022119871846e74d4f6e11 Mon Sep 17 00:00:00 2001 From: Jiaying Zhang <jiayingz@google.com> Date: Thu, 4 Mar 2010 16:14:02 -0500 Subject: ext4: use ext4_get_block_write in buffer write Allocate uninitialized extent before ext4 buffer write and convert the extent to initialized after io completes. The purpose is to make sure an extent can only be marked initialized after it has been written with new data so we can safely drop the i_mutex lock in ext4 DIO read without exposing stale data. This helps to improve multi-thread DIO read performance on high-speed disks. Skip the nobh and data=journal mount cases to make things simple for now. Signed-off-by: Jiaying Zhang <jiayingz@google.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/ext4.h | 15 +++- fs/ext4/ext4_jbd2.h | 24 ++++++ fs/ext4/extents.c | 22 +++--- fs/ext4/inode.c | 213 ++++++++++++++++++++++++++++++++++++++++++---------- fs/ext4/super.c | 37 ++++++++- 5 files changed, 256 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c831a580bd76..dee45800dc95 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -138,7 +138,7 @@ typedef struct ext4_io_end { struct list_head list; /* per-file finished AIO list */ struct inode *inode; /* file being written to */ unsigned int flag; /* unwritten or not */ - int error; /* I/O error code */ + struct page *page; /* page struct for buffer write */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ struct work_struct work; /* data work queue */ @@ -361,7 +361,7 @@ struct ext4_new_group_data { EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) /* Convert extent to initialized after IO complete */ #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ - EXT4_GET_BLOCKS_IO_CREATE_EXT) + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) /* * Flags used by ext4_free_blocks @@ -702,6 +702,7 @@ struct ext4_inode_info { /* completed IOs that might need unwritten extents handling */ struct list_head i_completed_io_list; + spinlock_t i_completed_io_lock; /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; @@ -752,6 +753,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ @@ -1781,6 +1783,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 len, __u64 *moved_len); +/* BH_Uninit flag: blocks are allocated but uninitialized on disk */ +enum ext4_state_bits { + BH_Uninit /* blocks are allocated but uninitialized on disk */ + = BH_JBDPrivateStart, +}; + +BUFFER_FNS(Uninit, uninit) +TAS_BUFFER_FNS(Uninit, uninit) + /* * Add new method to test wether block and inode bitmaps are properly * initialized. With uninit_bg reading the block from disk is not enough diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 05eca817d704..b79ad5126468 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -304,4 +304,28 @@ static inline int ext4_should_writeback_data(struct inode *inode) return 0; } +/* + * This function controls whether or not we should try to go down the + * dioread_nolock code paths, which makes it safe to avoid taking + * i_mutex for direct I/O reads. This only works for extent-based + * files, and it doesn't work for nobh or if data journaling is + * enabled, since the dioread_nolock code uses b_private to pass + * information back to the I/O completion handler, and this conflicts + * with the jbd's use of b_private. + */ +static inline int ext4_should_dioread_nolock(struct inode *inode) +{ + if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) + return 0; + if (test_opt(inode->i_sb, NOBH)) + return 0; + if (!S_ISREG(inode->i_mode)) + return 0; + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return 0; + if (ext4_should_journal_data(inode)) + return 0; + return 1; +} + #endif /* _EXT4_JBD2_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 90ba8d9df697..c7f166ab50eb 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1619,7 +1619,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, BUG_ON(path[depth].p_hdr == NULL); /* try to insert block into found extent and return */ - if (ex && (flag != EXT4_GET_BLOCKS_PRE_IO) + if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) && ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", ext4_ext_is_uninitialized(newext), @@ -1740,7 +1740,7 @@ has_space: merge: /* try to merge extents to the right */ - if (flag != EXT4_GET_BLOCKS_PRE_IO) + if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) ext4_ext_try_to_merge(inode, path, nearex); /* try to merge extents to the left */ @@ -3065,7 +3065,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ext4_ext_show_leaf(inode, path); /* get_block() before submit the IO, split the extent */ - if (flags == EXT4_GET_BLOCKS_PRE_IO) { + if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { ret = ext4_split_unwritten_extents(handle, inode, path, iblock, max_blocks, flags); @@ -3078,10 +3078,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, io->flag = EXT4_IO_UNWRITTEN; else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + if (ext4_should_dioread_nolock(inode)) + set_buffer_uninit(bh_result); goto out; } /* IO end_io complete, convert the filled extent to written */ - if (flags == EXT4_GET_BLOCKS_CONVERT) { + if ((flags & EXT4_GET_BLOCKS_CONVERT)) { ret = ext4_convert_unwritten_extents_endio(handle, inode, path); if (ret >= 0) @@ -3351,21 +3353,21 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ ext4_ext_mark_uninitialized(&newex); /* - * io_end structure was created for every async - * direct IO write to the middle of the file. - * To avoid unecessary convertion for every aio dio rewrite - * to the mid of file, here we flag the IO that is really - * need the convertion. + * io_end structure was created for every IO write to an + * uninitialized extent. To avoid unecessary conversion, + * here we flag the IO that really needs the conversion. * For non asycn direct IO case, flag the inode state * that we need to perform convertion when IO is done. */ - if (flags == EXT4_GET_BLOCKS_PRE_IO) { + if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { if (io) io->flag = EXT4_IO_UNWRITTEN; else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } + if (ext4_should_dioread_nolock(inode)) + set_buffer_uninit(bh_result); } if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 28f116bdc405..d291310aef6b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -38,6 +38,7 @@ #include <linux/uio.h> #include <linux/bio.h> #include <linux/workqueue.h> +#include <linux/kernel.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -1534,6 +1535,8 @@ static void ext4_truncate_failed_write(struct inode *inode) ext4_truncate(inode); } +static int ext4_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1575,8 +1578,12 @@ retry: } *pagep = page; - ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - ext4_get_block); + if (ext4_should_dioread_nolock(inode)) + ret = block_write_begin(file, mapping, pos, len, flags, pagep, + fsdata, ext4_get_block_write); + else + ret = block_write_begin(file, mapping, pos, len, flags, pagep, + fsdata, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = walk_page_buffers(handle, page_buffers(page), @@ -2092,6 +2099,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, } else if (buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); + if (buffer_uninit(exbh)) + set_buffer_uninit(bh); cur_logical++; pblock++; } while ((bh = bh->b_this_page) != head); @@ -2221,6 +2230,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) */ new.b_state = 0; get_blocks_flags = EXT4_GET_BLOCKS_CREATE; + if (ext4_should_dioread_nolock(mpd->inode)) + get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; if (mpd->b_state & (1 << BH_Delay)) get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; @@ -2636,6 +2647,9 @@ out: return ret; } +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); + /* * Note that we don't need to start a transaction unless we're journaling data * because we should have holes filled from ext4_page_mkwrite(). We even don't @@ -2683,7 +2697,7 @@ static int ext4_writepage(struct page *page, int ret = 0; loff_t size; unsigned int len; - struct buffer_head *page_bufs; + struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; trace_ext4_writepage(inode, page); @@ -2759,7 +2773,11 @@ static int ext4_writepage(struct page *page, if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) ret = nobh_writepage(page, noalloc_get_block_write, wbc); - else + else if (page_bufs && buffer_uninit(page_bufs)) { + ext4_set_bh_endio(page_bufs, inode); + ret = block_write_full_page_endio(page, noalloc_get_block_write, + wbc, ext4_end_io_buffer_write); + } else ret = block_write_full_page(page, noalloc_get_block_write, wbc); @@ -3347,10 +3365,44 @@ ext4_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); } +static void ext4_free_io_end(ext4_io_end_t *io) +{ + BUG_ON(!io); + if (io->page) + put_page(io->page); + iput(io->inode); + kfree(io); +} + +static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + if (!page_has_buffers(page)) + return; + head = bh = page_buffers(page); + do { + if (offset <= curr_off && test_clear_buffer_uninit(bh) + && bh->b_private) { + ext4_free_io_end(bh->b_private); + bh->b_private = NULL; + bh->b_end_io = NULL; + } + curr_off = curr_off + bh->b_size; + bh = bh->b_this_page; + } while (bh != head); +} + static void ext4_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = EXT4_JOURNAL(page->mapping->host); + /* + * free any io_end structure allocated for buffers to be discarded + */ + if (ext4_should_dioread_nolock(page->mapping->host)) + ext4_invalidatepage_free_endio(page, offset); /* * If it's a full truncate we just forget about the pending dirtying */ @@ -3471,10 +3523,11 @@ out: static int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - handle_t *handle = NULL; + handle_t *handle = ext4_journal_current_handle(); int ret = 0; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; int dio_credits; + int started = 0; ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", inode->i_ino, create); @@ -3485,37 +3538,36 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock, */ create = EXT4_GET_BLOCKS_IO_CREATE_EXT; - if (max_blocks > DIO_MAX_BLOCKS) - max_blocks = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); - handle = ext4_journal_start(inode, dio_credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; + if (!handle) { + if (max_blocks > DIO_MAX_BLOCKS) + max_blocks = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); + handle = ext4_journal_start(inode, dio_credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + started = 1; } + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, create); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; } - ext4_journal_stop(handle); + if (started) + ext4_journal_stop(handle); out: return ret; } -static void ext4_free_io_end(ext4_io_end_t *io) -{ - BUG_ON(!io); - iput(io->inode); - kfree(io); -} - static void dump_completed_IO(struct inode * inode) { #ifdef EXT4_DEBUG struct list_head *cur, *before, *after; ext4_io_end_t *io, *io0, *io1; + unsigned long flags; if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); @@ -3523,6 +3575,7 @@ static void dump_completed_IO(struct inode * inode) } ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ cur = &io->list; before = cur->prev; @@ -3533,6 +3586,7 @@ static void dump_completed_IO(struct inode * inode) ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", io, inode->i_ino, io0, io1); } + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); #endif } @@ -3556,9 +3610,7 @@ static int ext4_end_io_nolock(ext4_io_end_t *io) if (io->flag != EXT4_IO_UNWRITTEN) return ret; - if (offset + size <= i_size_read(inode)) - ret = ext4_convert_unwritten_extents(inode, offset, size); - + ret = ext4_convert_unwritten_extents(inode, offset, size); if (ret < 0) { printk(KERN_EMERG "%s: failed to convert unwritten" "extents to written extents, error is %d" @@ -3577,18 +3629,25 @@ static int ext4_end_io_nolock(ext4_io_end_t *io) */ static void ext4_end_io_work(struct work_struct *work) { - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); - struct inode *inode = io->inode; - int ret = 0; + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); + struct inode *inode = io->inode; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long flags; + int ret; mutex_lock(&inode->i_mutex); ret = ext4_end_io_nolock(io); - if (ret >= 0) { - if (!list_empty(&io->list)) - list_del_init(&io->list); - ext4_free_io_end(io); + if (ret < 0) { + mutex_unlock(&inode->i_mutex); + return; } + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (!list_empty(&io->list)) + list_del_init(&io->list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); mutex_unlock(&inode->i_mutex); + ext4_free_io_end(io); } /* @@ -3607,15 +3666,18 @@ static void ext4_end_io_work(struct work_struct *work) int flush_completed_IO(struct inode *inode) { ext4_io_end_t *io; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long flags; int ret = 0; int ret2 = 0; - if (list_empty(&EXT4_I(inode)->i_completed_io_list)) + if (list_empty(&ei->i_completed_io_list)) return ret; dump_completed_IO(inode); - while (!list_empty(&EXT4_I(inode)->i_completed_io_list)){ - io = list_entry(EXT4_I(inode)->i_completed_io_list.next, + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + while (!list_empty(&ei->i_completed_io_list)){ + io = list_entry(ei->i_completed_io_list.next, ext4_io_end_t, list); /* * Calling ext4_end_io_nolock() to convert completed @@ -3631,20 +3693,23 @@ int flush_completed_IO(struct inode *inode) * avoid double converting from both fsync and background work * queue work. */ + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); ret = ext4_end_io_nolock(io); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); if (ret < 0) ret2 = ret; else list_del_init(&io->list); } + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); return (ret2 < 0) ? ret2 : 0; } -static ext4_io_end_t *ext4_init_io_end (struct inode *inode) +static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) { ext4_io_end_t *io = NULL; - io = kmalloc(sizeof(*io), GFP_NOFS); + io = kmalloc(sizeof(*io), flags); if (io) { igrab(inode); @@ -3652,7 +3717,7 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode) io->flag = 0; io->offset = 0; io->size = 0; - io->error = 0; + io->page = NULL; INIT_WORK(&io->work, ext4_end_io_work); INIT_LIST_HEAD(&io->list); } @@ -3665,6 +3730,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, { ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; + unsigned long flags; + struct ext4_inode_info *ei; /* if not async direct IO or dio with 0 bytes write, just return */ if (!io_end || !size) @@ -3684,17 +3751,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, io_end->offset = offset; io_end->size = size; + io_end->flag = EXT4_IO_UNWRITTEN; wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; /* queue the work to convert unwritten extents to written */ queue_work(wq, &io_end->work); /* Add the io_end to per-inode completed aio dio list*/ - list_add_tail(&io_end->list, - &EXT4_I(io_end->inode)->i_completed_io_list); + ei = EXT4_I(io_end->inode); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + list_add_tail(&io_end->list, &ei->i_completed_io_list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); iocb->private = NULL; } +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) +{ + ext4_io_end_t *io_end = bh->b_private; + struct workqueue_struct *wq; + struct inode *inode; + unsigned long flags; + + if (!test_clear_buffer_uninit(bh) || !io_end) + goto out; + + if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { + printk("sb umounted, discard end_io request for inode %lu\n", + io_end->inode->i_ino); + ext4_free_io_end(io_end); + goto out; + } + + io_end->flag = EXT4_IO_UNWRITTEN; + inode = io_end->inode; + + /* Add the io_end to per-inode completed io list*/ + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); + + wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; + /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); +out: + bh->b_private = NULL; + bh->b_end_io = NULL; + clear_buffer_uninit(bh); + end_buffer_async_write(bh, uptodate); +} + +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) +{ + ext4_io_end_t *io_end; + struct page *page = bh->b_page; + loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; + size_t size = bh->b_size; + +retry: + io_end = ext4_init_io_end(inode, GFP_ATOMIC); + if (!io_end) { + if (printk_ratelimit()) + printk(KERN_WARNING "%s: allocation fail\n", __func__); + schedule(); + goto retry; + } + io_end->offset = offset; + io_end->size = size; + /* + * We need to hold a reference to the page to make sure it + * doesn't get evicted before ext4_end_io_work() has a chance + * to convert the extent from written to unwritten. + */ + io_end->page = page; + get_page(io_end->page); + + bh->b_private = io_end; + bh->b_end_io = ext4_end_io_buffer_write; + return 0; +} + /* * For ext4 extent files, ext4 will do direct-io write to holes, * preallocated extents, and those write extend the file, no need to @@ -3748,7 +3883,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, iocb->private = NULL; EXT4_I(inode)->cur_aio_dio = NULL; if (!is_sync_kiocb(iocb)) { - iocb->private = ext4_init_io_end(inode); + iocb->private = ext4_init_io_end(inode, GFP_NOFS); if (!iocb->private) return -ENOMEM; /* diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dc7a97e79e3b..5e8f9077b0fc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -709,6 +709,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_reserved_quota = 0; #endif INIT_LIST_HEAD(&ei->i_completed_io_list); + spin_lock_init(&ei->i_completed_io_lock); ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; ei->i_datasync_tid = 0; @@ -926,6 +927,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NOLOAD)) seq_puts(seq, ",norecovery"); + if (test_opt(sb, DIOREAD_NOLOCK)) + seq_puts(seq, ",dioread_nolock"); + ext4_show_quota_options(seq, sb); return 0; @@ -1109,6 +1113,7 @@ enum { Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, }; @@ -1176,6 +1181,8 @@ static const match_table_t tokens = { {Opt_auto_da_alloc, "auto_da_alloc=%u"}, {Opt_auto_da_alloc, "auto_da_alloc"}, {Opt_noauto_da_alloc, "noauto_da_alloc"}, + {Opt_dioread_nolock, "dioread_nolock"}, + {Opt_dioread_lock, "dioread_lock"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_err, NULL}, @@ -1640,6 +1647,12 @@ set_qf_format: case Opt_nodiscard: clear_opt(sbi->s_mount_opt, DISCARD); break; + case Opt_dioread_nolock: + set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + break; + case Opt_dioread_lock: + clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + break; default: ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " @@ -2795,7 +2808,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); - goto failed_mount4; + goto failed_mount_wq; } else { clear_opt(sbi->s_mount_opt, DATA_FLAGS); set_opt(sbi->s_mount_opt, WRITEBACK_DATA); @@ -2808,7 +2821,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); - goto failed_mount4; + goto failed_mount_wq; } if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { @@ -2847,7 +2860,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { ext4_msg(sb, KERN_ERR, "Journal does not support " "requested data journaling mode"); - goto failed_mount4; + goto failed_mount_wq; } default: break; @@ -2855,13 +2868,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); no_journal: - if (test_opt(sb, NOBH)) { if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " "its supported only with writeback mode"); clear_opt(sbi->s_mount_opt, NOBH); } + if (test_opt(sb, DIOREAD_NOLOCK)) { + ext4_msg(sb, KERN_WARNING, "dioread_nolock option is " + "not supported with nobh mode"); + goto failed_mount_wq; + } } EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); if (!EXT4_SB(sb)->dio_unwritten_wq) { @@ -2926,6 +2943,18 @@ no_journal: "requested data journaling mode"); clear_opt(sbi->s_mount_opt, DELALLOC); } + if (test_opt(sb, DIOREAD_NOLOCK)) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " + "option - requested data journaling mode"); + clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + } + if (sb->s_blocksize < PAGE_SIZE) { + ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " + "option - block size is too small"); + clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + } + } err = ext4_setup_system_zone(sb); if (err) { -- cgit v1.2.3 From b7adc1f363e72e9131a582cc2cb00eaf83f51a39 Mon Sep 17 00:00:00 2001 From: Jiaying Zhang <jiayingz@google.com> Date: Tue, 2 Mar 2010 13:26:36 -0500 Subject: ext4: Use direct_IO_no_locking in ext4 dio read Signed-off-by: Jiaying Zhang <jiayingz@google.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/inode.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d291310aef6b..92214d4e5afa 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3474,7 +3474,14 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, } retry: - ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + if (rw == READ && ext4_should_dioread_nolock(inode)) + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL); + else + ret = blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block, NULL); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) -- cgit v1.2.3 From 273df556b6ee2065bfe96edab5888d3dc9b108d8 Mon Sep 17 00:00:00 2001 From: Frank Mayhar <fmayhar@google.com> Date: Tue, 2 Mar 2010 11:46:09 -0500 Subject: ext4: Convert BUG_ON checks to use ext4_error() instead Convert a bunch of BUG_ONs to emit a ext4_error() message and return EIO. This is a first pass and most notably does _not_ cover mballoc.c, which is a morass of void functions. Signed-off-by: Frank Mayhar <fmayhar@google.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/ext4.h | 10 +++ fs/ext4/extents.c | 189 ++++++++++++++++++++++++++++++++++++++++++------------ fs/ext4/inode.c | 18 +++++- fs/ext4/super.c | 36 +++++++++++ 4 files changed, 209 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index dee45800dc95..3d85bbb09f1b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -53,6 +53,12 @@ #define ext4_debug(f, a...) do {} while (0) #endif +#define EXT4_ERROR_INODE(inode, fmt, a...) \ + ext4_error_inode(__func__, (inode), (fmt), ## a); + +#define EXT4_ERROR_FILE(file, fmt, a...) \ + ext4_error_file(__func__, (file), (fmt), ## a); + /* data type for block offset of block group */ typedef int ext4_grpblk_t; @@ -1492,6 +1498,10 @@ extern int ext4_group_extend(struct super_block *sb, extern void __ext4_error(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); #define ext4_error(sb, message...) __ext4_error(sb, __func__, ## message) +extern void ext4_error_inode(const char *, struct inode *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void ext4_error_file(const char *, struct file *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); extern void __ext4_std_error(struct super_block *, const char *, int); extern void ext4_abort(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c7f166ab50eb..4bb69206f175 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -703,7 +703,12 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, } eh = ext_block_hdr(bh); ppos++; - BUG_ON(ppos > depth); + if (unlikely(ppos > depth)) { + put_bh(bh); + EXT4_ERROR_INODE(inode, + "ppos %d > depth %d", ppos, depth); + goto err; + } path[ppos].p_bh = bh; path[ppos].p_hdr = eh; i--; @@ -749,7 +754,12 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode, if (err) return err; - BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block)); + if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { + EXT4_ERROR_INODE(inode, + "logical %d == ei_block %d!", + logical, le32_to_cpu(curp->p_idx->ei_block)); + return -EIO; + } len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ @@ -779,9 +789,17 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode, ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); - BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries) - > le16_to_cpu(curp->p_hdr->eh_max)); - BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr)); + if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) + > le16_to_cpu(curp->p_hdr->eh_max))) { + EXT4_ERROR_INODE(inode, + "logical %d == ei_block %d!", + logical, le32_to_cpu(curp->p_idx->ei_block)); + return -EIO; + } + if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); + return -EIO; + } err = ext4_ext_dirty(handle, inode, curp); ext4_std_error(inode->i_sb, err); @@ -819,7 +837,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, /* if current leaf will be split, then we should use * border from split point */ - BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr)); + if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); + return -EIO; + } if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { border = path[depth].p_ext[1].ee_block; ext_debug("leaf will be split." @@ -860,7 +881,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, /* initialize new leaf */ newblock = ablocks[--a]; - BUG_ON(newblock == 0); + if (unlikely(newblock == 0)) { + EXT4_ERROR_INODE(inode, "newblock == 0!"); + err = -EIO; + goto cleanup; + } bh = sb_getblk(inode->i_sb, newblock); if (!bh) { err = -EIO; @@ -880,7 +905,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ex = EXT_FIRST_EXTENT(neh); /* move remainder of path[depth] to the new leaf */ - BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max); + if (unlikely(path[depth].p_hdr->eh_entries != + path[depth].p_hdr->eh_max)) { + EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", + path[depth].p_hdr->eh_entries, + path[depth].p_hdr->eh_max); + err = -EIO; + goto cleanup; + } /* start copy from next extent */ /* TODO: we could do it by single memmove */ m = 0; @@ -927,7 +959,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, /* create intermediate indexes */ k = depth - at - 1; - BUG_ON(k < 0); + if (unlikely(k < 0)) { + EXT4_ERROR_INODE(inode, "k %d < 0!", k); + err = -EIO; + goto cleanup; + } if (k) ext_debug("create %d intermediate indices\n", k); /* insert new index into current index block */ @@ -964,8 +1000,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, EXT_MAX_INDEX(path[i].p_hdr)); - BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) != - EXT_LAST_INDEX(path[i].p_hdr)); + if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != + EXT_LAST_INDEX(path[i].p_hdr))) { + EXT4_ERROR_INODE(inode, + "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", + le32_to_cpu(path[i].p_ext->ee_block)); + err = -EIO; + goto cleanup; + } while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { ext_debug("%d: move %d:%llu in new index %llu\n", i, le32_to_cpu(path[i].p_idx->ei_block), @@ -1203,7 +1245,10 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex; int depth, ee_len; - BUG_ON(path == NULL); + if (unlikely(path == NULL)) { + EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); + return -EIO; + } depth = path->p_depth; *phys = 0; @@ -1217,15 +1262,33 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, ex = path[depth].p_ext; ee_len = ext4_ext_get_actual_len(ex); if (*logical < le32_to_cpu(ex->ee_block)) { - BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); + if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { + EXT4_ERROR_INODE(inode, + "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", + *logical, le32_to_cpu(ex->ee_block)); + return -EIO; + } while (--depth >= 0) { ix = path[depth].p_idx; - BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); + if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, + "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", + ix != NULL ? ix->ei_block : 0, + EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? + EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0, + depth); + return -EIO; + } } return 0; } - BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); + if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { + EXT4_ERROR_INODE(inode, + "logical %d < ee_block %d + ee_len %d!", + *logical, le32_to_cpu(ex->ee_block), ee_len); + return -EIO; + } *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; *phys = ext_pblock(ex) + ee_len - 1; @@ -1251,7 +1314,10 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, int depth; /* Note, NOT eh_depth; depth from top of tree */ int ee_len; - BUG_ON(path == NULL); + if (unlikely(path == NULL)) { + EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); + return -EIO; + } depth = path->p_depth; *phys = 0; @@ -1265,17 +1331,32 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, ex = path[depth].p_ext; ee_len = ext4_ext_get_actual_len(ex); if (*logical < le32_to_cpu(ex->ee_block)) { - BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); + if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { + EXT4_ERROR_INODE(inode, + "first_extent(path[%d].p_hdr) != ex", + depth); + return -EIO; + } while (--depth >= 0) { ix = path[depth].p_idx; - BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); + if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, + "ix != EXT_FIRST_INDEX *logical %d!", + *logical); + return -EIO; + } } *logical = le32_to_cpu(ex->ee_block); *phys = ext_pblock(ex); return 0; } - BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); + if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { + EXT4_ERROR_INODE(inode, + "logical %d < ee_block %d + ee_len %d!", + *logical, le32_to_cpu(ex->ee_block), ee_len); + return -EIO; + } if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { /* next allocated block in this leaf */ @@ -1414,8 +1495,12 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, eh = path[depth].p_hdr; ex = path[depth].p_ext; - BUG_ON(ex == NULL); - BUG_ON(eh == NULL); + + if (unlikely(ex == NULL || eh == NULL)) { + EXT4_ERROR_INODE(inode, + "ex %p == NULL or eh %p == NULL", ex, eh); + return -EIO; + } if (depth == 0) { /* there is no tree at all */ @@ -1613,10 +1698,16 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, ext4_lblk_t next; unsigned uninitialized = 0; - BUG_ON(ext4_ext_get_actual_len(newext) == 0); + if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { + EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); + return -EIO; + } depth = ext_depth(inode); ex = path[depth].p_ext; - BUG_ON(path[depth].p_hdr == NULL); + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + return -EIO; + } /* try to insert block into found extent and return */ if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) @@ -1788,7 +1879,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, } depth = ext_depth(inode); - BUG_ON(path[depth].p_hdr == NULL); + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + err = -EIO; + break; + } ex = path[depth].p_ext; next = ext4_ext_next_allocated_block(path); @@ -1839,7 +1934,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, cbex.ec_type = EXT4_EXT_CACHE_EXTENT; } - BUG_ON(cbex.ec_len == 0); + if (unlikely(cbex.ec_len == 0)) { + EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); + err = -EIO; + break; + } err = func(inode, path, &cbex, ex, cbdata); ext4_ext_drop_refs(path); @@ -1982,7 +2081,10 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, /* free index block */ path--; leaf = idx_pblock(path->p_idx); - BUG_ON(path->p_hdr->eh_entries == 0); + if (unlikely(path->p_hdr->eh_entries == 0)) { + EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); + return -EIO; + } err = ext4_ext_get_access(handle, inode, path); if (err) return err; @@ -2120,8 +2222,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (!path[depth].p_hdr) path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); eh = path[depth].p_hdr; - BUG_ON(eh == NULL); - + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + return -EIO; + } /* find where to start removing */ ex = EXT_LAST_EXTENT(eh); @@ -3240,10 +3344,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * this situation is possible, though, _during_ tree modification; * this is why assert can't be put in ext4_ext_find_extent() */ - if (path[depth].p_ext == NULL && depth != 0) { - ext4_error(inode->i_sb, "bad extent address " - "inode: %lu, iblock: %d, depth: %d", - inode->i_ino, iblock, depth); + if (unlikely(path[depth].p_ext == NULL && depth != 0)) { + EXT4_ERROR_INODE(inode, "bad extent address " + "iblock: %d, depth: %d pblock %lld", + iblock, depth, path[depth].p_block); err = -EIO; goto out2; } @@ -3371,16 +3475,17 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, } if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { - if (eh->eh_entries) { - last_ex = EXT_LAST_EXTENT(eh); - if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) - + ext4_ext_get_actual_len(last_ex)) - EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; - } else { - WARN_ON(eh->eh_entries == 0); - ext4_error(inode->i_sb, __func__, - "inode#%lu, eh->eh_entries = 0!", inode->i_ino); - } + if (unlikely(!eh->eh_entries)) { + EXT4_ERROR_INODE(inode, + "eh->eh_entries == 0 ee_block %d", + ex->ee_block); + err = -EIO; + goto out2; + } + last_ex = EXT_LAST_EXTENT(eh); + if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) + + ext4_ext_get_actual_len(last_ex)) + EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; } err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 92214d4e5afa..c717a74f2178 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -607,7 +607,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, if (*err) goto failed_out; - BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS); + if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + count %lu > %d!", + current_block, count, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } target -= count; /* allocate blocks for indirect blocks */ @@ -643,7 +650,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, ar.flags = EXT4_MB_HINT_DATA; current_block = ext4_mb_new_blocks(handle, &ar, err); - BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS); + if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + ar.len %d > %d!", + current_block, ar.len, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } if (*err && (target == blks)) { /* diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5e8f9077b0fc..5a18e9ec7cf9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -347,6 +347,42 @@ void __ext4_error(struct super_block *sb, const char *function, ext4_handle_error(sb); } +void ext4_error_inode(const char *function, struct inode *inode, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ", + inode->i_sb->s_id, function, inode->i_ino, current->comm); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + ext4_handle_error(inode->i_sb); +} + +void ext4_error_file(const char *function, struct file *file, + const char *fmt, ...) +{ + va_list args; + struct inode *inode = file->f_dentry->d_inode; + char pathname[80], *path; + + va_start(args, fmt); + path = d_path(&(file->f_path), pathname, sizeof(pathname)); + if (!path) + path = "(unknown)"; + printk(KERN_CRIT + "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ", + inode->i_sb->s_id, function, inode->i_ino, current->comm, path); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + ext4_handle_error(inode->i_sb); +} + static const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]) { -- cgit v1.2.3 From 67eeb5685d2a211c0252ac7884142e503c759500 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov <dmonakhov@openvz.org> Date: Tue, 2 Mar 2010 08:08:51 -0500 Subject: ext4: Fix ext4_quota_write cross block boundary behaviour We always assume what dquot update result in changes in one data block But ext4_quota_write() function may handle cross block boundary writes In fact if this ever happen it will result in incorrect journal credits reservation, and later a BUG_ON. As soon this never happen the boundary cross loop is NOOP. In order to make things straight let's remove this loop and assert cross boundary condition. Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/super.c | 69 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 34 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5a18e9ec7cf9..ad1ee5f21bab 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4010,9 +4010,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int err = 0; int offset = off & (sb->s_blocksize - 1); - int tocopy; int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL; - size_t towrite = len; struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -4022,52 +4020,53 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, (unsigned long long)off, (unsigned long long)len); return -EIO; } + /* + * Since we account only one data block in transaction credits, + * then it is impossible to cross a block boundary. + */ + if (sb->s_blocksize - offset < len) { + ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because not block aligned", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); - while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? - sb->s_blocksize - offset : towrite; - bh = ext4_bread(handle, inode, blk, 1, &err); - if (!bh) + bh = ext4_bread(handle, inode, blk, 1, &err); + if (!bh) + goto out; + if (journal_quota) { + err = ext4_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); goto out; - if (journal_quota) { - err = ext4_journal_get_write_access(handle, bh); - if (err) { - brelse(bh); - goto out; - } } - lock_buffer(bh); - memcpy(bh->b_data+offset, data, tocopy); - flush_dcache_page(bh->b_page); - unlock_buffer(bh); - if (journal_quota) - err = ext4_handle_dirty_metadata(handle, NULL, bh); - else { - /* Always do at least ordered writes for quotas */ - err = ext4_jbd2_file_inode(handle, inode); - mark_buffer_dirty(bh); - } - brelse(bh); - if (err) - goto out; - offset = 0; - towrite -= tocopy; - data += tocopy; - blk++; } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, len); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + if (journal_quota) + err = ext4_handle_dirty_metadata(handle, NULL, bh); + else { + /* Always do at least ordered writes for quotas */ + err = ext4_jbd2_file_inode(handle, inode); + mark_buffer_dirty(bh); + } + brelse(bh); out: - if (len == towrite) { + if (err) { mutex_unlock(&inode->i_mutex); return err; } - if (inode->i_size < off+len-towrite) { - i_size_write(inode, off+len-towrite); + if (inode->i_size < off + len) { + i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; } inode->i_mtime = inode->i_ctime = CURRENT_TIME; ext4_mark_inode_dirty(handle, inode); mutex_unlock(&inode->i_mutex); - return len - towrite; + return len; } #endif -- cgit v1.2.3 From 888ef2e3f8b7b8daeb031bfb4ad1fd4fa817e193 Mon Sep 17 00:00:00 2001 From: Alexandros Batsakis <batsakis@netapp.com> Date: Fri, 5 Feb 2010 03:45:03 -0800 Subject: nfs: kill renewd before clearing client minor version renewd should be synchronously killed before we destroy the session in nfs4_clear_minor_version Signed-off-by: Alexandros Batsakis <batsakis@netapp.com> [Trond.Myklebust@netapp.com: clean up to remove 'unused function warning when !CONFIG_NFS_V4] Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/client.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index ee77713ce68b..2274f1737336 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -164,30 +164,7 @@ error_0: return ERR_PTR(err); } -static void nfs4_shutdown_client(struct nfs_client *clp) -{ -#ifdef CONFIG_NFS_V4 - if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) - nfs4_kill_renewd(clp); - BUG_ON(!RB_EMPTY_ROOT(&clp->cl_state_owners)); - if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) - nfs_idmap_delete(clp); - - rpc_destroy_wait_queue(&clp->cl_rpcwaitq); -#endif -} - -/* - * Destroy the NFS4 callback service - */ -static void nfs4_destroy_callback(struct nfs_client *clp) -{ #ifdef CONFIG_NFS_V4 - if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) - nfs_callback_down(clp->cl_minorversion); -#endif /* CONFIG_NFS_V4 */ -} - /* * Clears/puts all minor version specific parts from an nfs_client struct * reverting it to minorversion 0. @@ -202,9 +179,33 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp) clp->cl_call_sync = _nfs4_call_sync; #endif /* CONFIG_NFS_V4_1 */ +} +/* + * Destroy the NFS4 callback service + */ +static void nfs4_destroy_callback(struct nfs_client *clp) +{ + if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) + nfs_callback_down(clp->cl_minorversion); +} + +static void nfs4_shutdown_client(struct nfs_client *clp) +{ + if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) + nfs4_kill_renewd(clp); + nfs4_clear_client_minor_version(clp); nfs4_destroy_callback(clp); + if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) + nfs_idmap_delete(clp); + + rpc_destroy_wait_queue(&clp->cl_rpcwaitq); } +#else +static void nfs4_shutdown_client(struct nfs_client *clp) +{ +} +#endif /* CONFIG_NFS_V4 */ /* * Destroy a shared client record @@ -213,7 +214,6 @@ static void nfs_free_client(struct nfs_client *clp) { dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); - nfs4_clear_client_minor_version(clp); nfs4_shutdown_client(clp); nfs_fscache_release_client_cookie(clp); -- cgit v1.2.3 From dc96aef96a75348b4d1b01c4c0429ab52780683e Mon Sep 17 00:00:00 2001 From: Alexandros Batsakis <batsakis@netapp.com> Date: Fri, 5 Feb 2010 03:45:04 -0800 Subject: nfs: prevent backlogging of renewd requests If the renewd send queue gets backlogged (e.g., if the server goes down), we will keep filling the queue with periodic RENEW/SEQUENCE requests. This patch schedules a new renewd request if and only if the previous one returns (either success or failure) Signed-off-by: Alexandros Batsakis <batsakis@netapp.com> [Trond.Myklebust@netapp.com: moved nfs4_schedule_state_renewal() into separate nfs4_renew_release() and nfs41_sequence_release() callbacks to ensure correct behaviour on call setup failure] Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/nfs4proc.c | 24 ++++++++++++++++++++---- fs/nfs/nfs4renewd.c | 24 +++++++----------------- 2 files changed, 27 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 84b53d38f50b..726bc195039d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3147,10 +3147,17 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special * standalone procedure for queueing an asynchronous RENEW. */ +static void nfs4_renew_release(void *data) +{ + struct nfs_client *clp = data; + + nfs4_schedule_state_renewal(clp); +} + static void nfs4_renew_done(struct rpc_task *task, void *data) { - struct nfs_client *clp = (struct nfs_client *)task->tk_msg.rpc_argp; - unsigned long timestamp = (unsigned long)data; + struct nfs_client *clp = data; + unsigned long timestamp = task->tk_start; if (task->tk_status < 0) { /* Unless we're shutting down, schedule state recovery! */ @@ -3166,6 +3173,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *data) static const struct rpc_call_ops nfs4_renew_ops = { .rpc_call_done = nfs4_renew_done, + .rpc_release = nfs4_renew_release, }; int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) @@ -3177,7 +3185,7 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) }; return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, - &nfs4_renew_ops, (void *)jiffies); + &nfs4_renew_ops, clp); } int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) @@ -5023,7 +5031,14 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) &res, args.sa_cache_this, 1); } -void nfs41_sequence_call_done(struct rpc_task *task, void *data) +static void nfs41_sequence_release(void *data) +{ + struct nfs_client *clp = (struct nfs_client *)data; + + nfs4_schedule_state_renewal(clp); +} + +static void nfs41_sequence_call_done(struct rpc_task *task, void *data) { struct nfs_client *clp = (struct nfs_client *)data; @@ -5064,6 +5079,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data) static const struct rpc_call_ops nfs41_sequence_ops = { .rpc_call_done = nfs41_sequence_call_done, .rpc_call_prepare = nfs41_sequence_prepare, + .rpc_release = nfs41_sequence_release, }; static int nfs41_proc_async_sequence(struct nfs_client *clp, diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 0156c01c212c..d87f10327b72 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -36,11 +36,6 @@ * as an rpc_task, not a real kernel thread, so it always runs in rpciod's * context. There is one renewd per nfs_server. * - * TODO: If the send queue gets backlogged (e.g., if the server goes down), - * we will keep filling the queue with periodic RENEW requests. We need a - * mechanism for ensuring that if renewd successfully sends off a request, - * then it only wakes up when the request is finished. Maybe use the - * child task framework of the RPC layer? */ #include <linux/mm.h> @@ -63,7 +58,7 @@ nfs4_renew_state(struct work_struct *work) struct nfs_client *clp = container_of(work, struct nfs_client, cl_renewd.work); struct rpc_cred *cred; - long lease, timeout; + long lease; unsigned long last, now; ops = nfs4_state_renewal_ops[clp->cl_minorversion]; @@ -75,7 +70,6 @@ nfs4_renew_state(struct work_struct *work) lease = clp->cl_lease_time; last = clp->cl_last_renewal; now = jiffies; - timeout = (2 * lease) / 3 + (long)last - (long)now; /* Are we close to a lease timeout? */ if (time_after(now, last + lease/3)) { cred = ops->get_state_renewal_cred_locked(clp); @@ -90,19 +84,15 @@ nfs4_renew_state(struct work_struct *work) /* Queue an asynchronous RENEW. */ ops->sched_state_renewal(clp, cred); put_rpccred(cred); + goto out_exp; } - timeout = (2 * lease) / 3; - spin_lock(&clp->cl_lock); - } else + } else { dprintk("%s: failed to call renewd. Reason: lease not expired \n", __func__); - if (timeout < 5 * HZ) /* safeguard */ - timeout = 5 * HZ; - dprintk("%s: requeueing work. Lease period = %ld\n", - __func__, (timeout + HZ - 1) / HZ); - cancel_delayed_work(&clp->cl_renewd); - schedule_delayed_work(&clp->cl_renewd, timeout); - spin_unlock(&clp->cl_lock); + spin_unlock(&clp->cl_lock); + } + nfs4_schedule_state_renewal(clp); +out_exp: nfs_expire_unreferenced_delegations(clp); out: dprintk("%s: done\n", __func__); -- cgit v1.2.3 From 7135840fc74699513d50e0c9c64922f2d38aa5e3 Mon Sep 17 00:00:00 2001 From: Alexandros Batsakis <batsakis@netapp.com> Date: Fri, 5 Feb 2010 03:45:05 -0800 Subject: nfs41: renewd sequence operations should take/put client reference renewd sends SEQUENCE requests to the NFS server in order to renew state. As the request is asynchronous, renewd should take a reference to the nfs_client to prevent concurrent umounts from freeing the session/client Signed-off-by: Alexandros Batsakis <batsakis@netapp.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/nfs4proc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 726bc195039d..663ae0c36834 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -419,7 +419,8 @@ static void nfs41_sequence_done(struct nfs_client *clp, clp->cl_last_renewal = timestamp; spin_unlock(&clp->cl_lock); /* Check sequence flags */ - nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + if (atomic_read(&clp->cl_count) > 1) + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); } out: /* The session may be reset by one of the error handlers. */ @@ -5035,7 +5036,9 @@ static void nfs41_sequence_release(void *data) { struct nfs_client *clp = (struct nfs_client *)data; - nfs4_schedule_state_renewal(clp); + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); } static void nfs41_sequence_call_done(struct rpc_task *task, void *data) @@ -5046,6 +5049,8 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data) if (task->tk_status < 0) { dprintk("%s ERROR %d\n", __func__, task->tk_status); + if (atomic_read(&clp->cl_count) == 1) + goto out; if (_nfs4_async_handle_error(task, NULL, clp, NULL) == -EAGAIN) { @@ -5054,7 +5059,7 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data) } } dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); - +out: kfree(task->tk_msg.rpc_argp); kfree(task->tk_msg.rpc_resp); @@ -5092,12 +5097,13 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, .rpc_cred = cred, }; + if (!atomic_inc_not_zero(&clp->cl_count)) + return -EIO; args = kzalloc(sizeof(*args), GFP_KERNEL); - if (!args) - return -ENOMEM; res = kzalloc(sizeof(*res), GFP_KERNEL); - if (!res) { + if (!args || !res) { kfree(args); + nfs_put_client(clp); return -ENOMEM; } res->sr_slotid = NFS4_MAX_SLOT_TABLE; -- cgit v1.2.3 From 0851de06174e9800e76b26e4be0ca94294c09c8c Mon Sep 17 00:00:00 2001 From: Alexandros Batsakis <batsakis@netapp.com> Date: Fri, 5 Feb 2010 03:45:06 -0800 Subject: nfs4: renewd renew operations should take/put a client reference renewd sends RENEW requests to the NFS server in order to renew state. As the request is asynchronous, renewd should take a reference to the nfs_client to prevent concurrent umounts from freeing the client Signed-off-by: Alexandros Batsakis <batsakis@netapp.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/nfs4proc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 663ae0c36834..68f1fe00c08c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3152,7 +3152,9 @@ static void nfs4_renew_release(void *data) { struct nfs_client *clp = data; - nfs4_schedule_state_renewal(clp); + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); } static void nfs4_renew_done(struct rpc_task *task, void *data) @@ -3185,6 +3187,8 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) .rpc_cred = cred, }; + if (!atomic_inc_not_zero(&clp->cl_count)) + return -EIO; return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, &nfs4_renew_ops, clp); } -- cgit v1.2.3 From 0f79fd6f5c52e05918e44996b0a1b18383d0fbc2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust <Trond.Myklebust@netapp.com> Date: Tue, 2 Mar 2010 13:06:21 -0500 Subject: NFSv4.1: Various fixes to the sequence flag error handling Ensure that we change the EXCHANGE_ID verifier (i.e. clp->cl_boot_time) when we want to reset all state. This is mainly needed when the server tells us that it is revoking our open or lock stateids. Handle revoking of recallable state by expiring the delegations. Handle callback path issues by expiring the delegations and then resetting the session. Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/nfs4state.c | 57 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2931c46c4127..6c5ed51f105e 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1255,26 +1255,59 @@ void nfs41_handle_recall_slot(struct nfs_client *clp) nfs4_schedule_state_recovery(clp); } +static void nfs4_reset_all_state(struct nfs_client *clp) +{ + if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { + clp->cl_boot_time = CURRENT_TIME; + nfs4_state_start_reclaim_nograce(clp); + nfs4_schedule_state_recovery(clp); + } +} + +static void nfs41_handle_server_reboot(struct nfs_client *clp) +{ + if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { + nfs4_state_start_reclaim_reboot(clp); + nfs4_schedule_state_recovery(clp); + } +} + +static void nfs41_handle_state_revoked(struct nfs_client *clp) +{ + /* Temporary */ + nfs4_reset_all_state(clp); +} + +static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp) +{ + /* This will need to handle layouts too */ + nfs_expire_all_delegations(clp); +} + +static void nfs41_handle_cb_path_down(struct nfs_client *clp) +{ + nfs_expire_all_delegations(clp); + if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) + nfs4_schedule_state_recovery(clp); +} + void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) { if (!flags) return; - else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) { - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - nfs4_state_start_reclaim_reboot(clp); - nfs4_schedule_state_recovery(clp); - } else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | + else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) + nfs41_handle_server_reboot(clp); + else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | SEQ4_STATUS_ADMIN_STATE_REVOKED | - SEQ4_STATUS_RECALLABLE_STATE_REVOKED | - SEQ4_STATUS_LEASE_MOVED)) { - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - nfs4_state_start_reclaim_nograce(clp); - nfs4_schedule_state_recovery(clp); - } else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | + SEQ4_STATUS_LEASE_MOVED)) + nfs41_handle_state_revoked(clp); + else if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) + nfs41_handle_recallable_state_revoked(clp); + else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | SEQ4_STATUS_BACKCHANNEL_FAULT | SEQ4_STATUS_CB_PATH_DOWN_SESSION)) - nfs_expire_all_delegations(clp); + nfs41_handle_cb_path_down(clp); } static int nfs4_reset_session(struct nfs_client *clp) -- cgit v1.2.3 From ebed9203b68a4f333ce5d17e874b26c3afcfeff1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust <Trond.Myklebust@netapp.com> Date: Tue, 2 Mar 2010 13:06:22 -0500 Subject: NFS: Fix an allocation-under-spinlock bug sunrpc_cache_update() will always call detail->update() from inside the detail->hash_lock, so it cannot allocate memory. Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: stable@kernel.org --- fs/nfs/dns_resolve.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index 95e1ca765d47..3f0cd4dfddaf 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -36,6 +36,19 @@ struct nfs_dns_ent { }; +static void nfs_dns_ent_update(struct cache_head *cnew, + struct cache_head *ckey) +{ + struct nfs_dns_ent *new; + struct nfs_dns_ent *key; + + new = container_of(cnew, struct nfs_dns_ent, h); + key = container_of(ckey, struct nfs_dns_ent, h); + + memcpy(&new->addr, &key->addr, key->addrlen); + new->addrlen = key->addrlen; +} + static void nfs_dns_ent_init(struct cache_head *cnew, struct cache_head *ckey) { @@ -49,8 +62,7 @@ static void nfs_dns_ent_init(struct cache_head *cnew, new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL); if (new->hostname) { new->namelen = key->namelen; - memcpy(&new->addr, &key->addr, key->addrlen); - new->addrlen = key->addrlen; + nfs_dns_ent_update(cnew, ckey); } else { new->namelen = 0; new->addrlen = 0; @@ -234,7 +246,7 @@ static struct cache_detail nfs_dns_resolve = { .cache_show = nfs_dns_show, .match = nfs_dns_match, .init = nfs_dns_ent_init, - .update = nfs_dns_ent_init, + .update = nfs_dns_ent_update, .alloc = nfs_dns_ent_alloc, }; -- cgit v1.2.3 From 9599945bac93b344519ea97f502cf537124b5a6e Mon Sep 17 00:00:00 2001 From: Jens Axboe <jens.axboe@oracle.com> Date: Tue, 2 Mar 2010 19:17:34 +0100 Subject: Revert "blkdev: fix merge_bvec_fn return value checks" This reverts commit 9f7cdbc33f36d28e57eaba0093f68f0d14c38c5b. It's causing oopses om dm setups, so revert it until we investigate. Reported-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> Tested-by: Steven Rostedt <rostedt@goodmis.org> Signed-off-by: Jens Axboe <jens.axboe@oracle.com> --- fs/bio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 0bda289f86fc..dc17afd672e3 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -555,7 +555,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page .bi_rw = bio->bi_rw, }; - if (q->merge_bvec_fn(q, &bvm, prev) != prev->bv_len) { + if (q->merge_bvec_fn(q, &bvm, prev) < len) { prev->bv_len -= len; return 0; } @@ -608,7 +608,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * merge_bvec_fn() returns number of bytes it can accept * at this offset */ - if (q->merge_bvec_fn(q, &bvm, bvec) != bvec->bv_len) { + if (q->merge_bvec_fn(q, &bvm, bvec) < len) { bvec->bv_page = NULL; bvec->bv_len = 0; bvec->bv_offset = 0; -- cgit v1.2.3 From 180b62a3d837613fcac3ce89576526423926c3c3 Mon Sep 17 00:00:00 2001 From: Andy Adamson <andros@netapp.com> Date: Tue, 2 Mar 2010 13:19:36 -0500 Subject: nfs41 fix NFS4ERR_CLID_INUSE for exchange id Signed-off-by: Andy Adamson <andros@netapp.com> Signed-off-by: Benny Halevy <bhalevy@panasas.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 68f1fe00c08c..adc116c57e14 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4519,7 +4519,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); - if (status != NFS4ERR_CLID_INUSE) + if (status != -NFS4ERR_CLID_INUSE) break; if (signalled()) -- cgit v1.2.3 From 9df5778ecee8b301b447fc05706792d5f447ace5 Mon Sep 17 00:00:00 2001 From: Tristan Ye <tristan.ye@oracle.com> Date: Tue, 2 Mar 2010 13:59:42 +0800 Subject: Ocfs2: Move ocfs2 ioctl definitions from ocfs2_fs.h to newly added ocfs2_ioctl.h Currently we were adding ioctl cmds/structures for ocfs2 into ocfs2_fs.h which was used for define ocfs2 on-disk layout. That sounds a little bit confusing, and it may be quickly polluted espcially when growing the ocfs2_info_request ioctls afterwards(it will grow i bet). As a result, such OCFS2 IOCs do need to be placed somewhere other than ocfs2_fs.h, a separated ocfs2_ioctl.h will be added to store such ioctl structures and definitions which could also be used from userspace to invoke ioctls call. Signed-off-by: Tristan Ye <tristan.ye@oracle.com> Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/ioctl.h | 6 ++-- fs/ocfs2/ocfs2.h | 1 + fs/ocfs2/ocfs2_fs.h | 57 ------------------------------------ fs/ocfs2/ocfs2_ioctl.h | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 60 deletions(-) create mode 100644 fs/ocfs2/ocfs2_ioctl.h (limited to 'fs') diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h index cf9a5ee30fef..0cd5323bd3f0 100644 --- a/fs/ocfs2/ioctl.h +++ b/fs/ocfs2/ioctl.h @@ -7,10 +7,10 @@ * */ -#ifndef OCFS2_IOCTL_H -#define OCFS2_IOCTL_H +#ifndef OCFS2_IOCTL_PROTO_H +#define OCFS2_IOCTL_PROTO_H long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); -#endif /* OCFS2_IOCTL_H */ +#endif /* OCFS2_IOCTL_PROTO_H */ diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index b27fe2489e0c..1238b491db90 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -42,6 +42,7 @@ #include "ocfs2_fs.h" #include "ocfs2_lockid.h" +#include "ocfs2_ioctl.h" /* For struct ocfs2_blockcheck_stats */ #include "blockcheck.h" diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 7638a38c32bc..bb37218a7978 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -253,63 +253,6 @@ * counted in an associated * refcount tree */ -/* - * ioctl commands - */ -#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) -#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long) -#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int) -#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) - -/* - * Space reservation / allocation / free ioctls and argument structure - * are designed to be compatible with XFS. - * - * ALLOCSP* and FREESP* are not and will never be supported, but are - * included here for completeness. - */ -struct ocfs2_space_resv { - __s16 l_type; - __s16 l_whence; - __s64 l_start; - __s64 l_len; /* len == 0 means until end of file */ - __s32 l_sysid; - __u32 l_pid; - __s32 l_pad[4]; /* reserve area */ -}; - -#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv) -#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv) -#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv) -#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv) -#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv) -#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv) -#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv) -#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv) - -/* Used to pass group descriptor data when online resize is done */ -struct ocfs2_new_group_input { - __u64 group; /* Group descriptor's blkno. */ - __u32 clusters; /* Total number of clusters in this group */ - __u32 frees; /* Total free clusters in this group */ - __u16 chain; /* Chain for this group */ - __u16 reserved1; - __u32 reserved2; -}; - -#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int) -#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input) -#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input) - -/* Used to pass 2 file names to reflink. */ -struct reflink_arguments { - __u64 old_path; - __u64 new_path; - __u64 preserve; -}; -#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) - - /* * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) */ diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h new file mode 100644 index 000000000000..2d3420af1a83 --- /dev/null +++ b/fs/ocfs2/ocfs2_ioctl.h @@ -0,0 +1,79 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_ioctl.h + * + * Defines OCFS2 ioctls. + * + * Copyright (C) 2010 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_IOCTL_H +#define OCFS2_IOCTL_H + +/* + * ioctl commands + */ +#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) +#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long) +#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int) +#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) + +/* + * Space reservation / allocation / free ioctls and argument structure + * are designed to be compatible with XFS. + * + * ALLOCSP* and FREESP* are not and will never be supported, but are + * included here for completeness. + */ +struct ocfs2_space_resv { + __s16 l_type; + __s16 l_whence; + __s64 l_start; + __s64 l_len; /* len == 0 means until end of file */ + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +}; + +#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv) +#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv) +#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv) +#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv) +#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv) +#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv) +#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv) +#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv) + +/* Used to pass group descriptor data when online resize is done */ +struct ocfs2_new_group_input { + __u64 group; /* Group descriptor's blkno. */ + __u32 clusters; /* Total number of clusters in this group */ + __u32 frees; /* Total free clusters in this group */ + __u16 chain; /* Chain for this group */ + __u16 reserved1; + __u32 reserved2; +}; + +#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int) +#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input) +#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input) + +/* Used to pass 2 file names to reflink. */ +struct reflink_arguments { + __u64 old_path; + __u64 new_path; + __u64 preserve; +}; +#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) + +#endif /* OCFS2_IOCTL_H */ -- cgit v1.2.3 From 4b1ae27a96d9860e6c4348673e8fb6a0322511fe Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Wed, 3 Mar 2010 12:58:31 -0500 Subject: Revert "autofs4: always use lookup for lookup" This reverts commit 213614d583748d00967a91cacd656f417efb36ce. Alas, ->d_revalidate() can't rely on ->lookup() finishing what it's started; if d_alloc() in do_lookup() fails, we are not going to call ->lookup() at all. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/autofs4/autofs_i.h | 7 - fs/autofs4/expire.c | 6 +- fs/autofs4/inode.c | 1 - fs/autofs4/root.c | 474 +++++++++++++++++--------------------------------- 4 files changed, 158 insertions(+), 330 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 0118d67221b2..3d283abf67d7 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -60,11 +60,6 @@ do { \ current->pid, __func__, ##args); \ } while (0) -struct rehash_entry { - struct task_struct *task; - struct list_head list; -}; - /* Unified info structure. This is pointed to by both the dentry and inode structures. Each file in the filesystem has an instance of this structure. It holds a reference to the dentry, so dentries are never @@ -81,7 +76,6 @@ struct autofs_info { struct list_head active; int active_count; - struct list_head rehash_list; struct list_head expiring; @@ -104,7 +98,6 @@ struct autofs_info { #define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ #define AUTOFS_INF_MOUNTPOINT (1<<1) /* mountpoint status for direct expire */ #define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ -#define AUTOFS_INF_REHASH (1<<3) /* dentry in transit to ->lookup() */ struct autofs_wait_queue { wait_queue_head_t queue; diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 74bc9aa6df31..a796c9417fb1 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -279,7 +279,6 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, root->d_mounted--; } ino->flags |= AUTOFS_INF_EXPIRING; - autofs4_add_expiring(root); init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); return root; @@ -407,7 +406,6 @@ found: expired, (int)expired->d_name.len, expired->d_name.name); ino = autofs4_dentry_ino(expired); ino->flags |= AUTOFS_INF_EXPIRING; - autofs4_add_expiring(expired); init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); spin_lock(&dcache_lock); @@ -435,7 +433,7 @@ int autofs4_expire_wait(struct dentry *dentry) DPRINTK("expire done status=%d", status); - if (d_unhashed(dentry) && IS_DEADDIR(dentry->d_inode)) + if (d_unhashed(dentry)) return -EAGAIN; return status; @@ -475,7 +473,6 @@ int autofs4_expire_run(struct super_block *sb, spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); ino->flags &= ~AUTOFS_INF_EXPIRING; - autofs4_del_expiring(dentry); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); @@ -506,7 +503,6 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, ino->flags &= ~AUTOFS_INF_MOUNTPOINT; } ino->flags &= ~AUTOFS_INF_EXPIRING; - autofs4_del_expiring(dentry); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); dput(dentry); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index d0a3de247458..4670a7818eac 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -49,7 +49,6 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino, ino->dentry = NULL; ino->size = 0; INIT_LIST_HEAD(&ino->active); - INIT_LIST_HEAD(&ino->rehash_list); ino->active_count = 0; INIT_LIST_HEAD(&ino->expiring); atomic_set(&ino->count, 0); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 30cc9ddf4b70..a015b49891df 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -104,99 +104,6 @@ static void autofs4_del_active(struct dentry *dentry) return; } -static void autofs4_add_rehash_entry(struct autofs_info *ino, - struct rehash_entry *entry) -{ - entry->task = current; - INIT_LIST_HEAD(&entry->list); - list_add(&entry->list, &ino->rehash_list); - return; -} - -static void autofs4_remove_rehash_entry(struct autofs_info *ino) -{ - struct list_head *head = &ino->rehash_list; - struct rehash_entry *entry; - list_for_each_entry(entry, head, list) { - if (entry->task == current) { - list_del(&entry->list); - kfree(entry); - break; - } - } - return; -} - -static void autofs4_remove_rehash_entrys(struct autofs_info *ino) -{ - struct autofs_sb_info *sbi = ino->sbi; - struct rehash_entry *entry, *next; - struct list_head *head; - - spin_lock(&sbi->fs_lock); - spin_lock(&sbi->lookup_lock); - if (!(ino->flags & AUTOFS_INF_REHASH)) { - spin_unlock(&sbi->lookup_lock); - spin_unlock(&sbi->fs_lock); - return; - } - ino->flags &= ~AUTOFS_INF_REHASH; - head = &ino->rehash_list; - list_for_each_entry_safe(entry, next, head, list) { - list_del(&entry->list); - kfree(entry); - } - spin_unlock(&sbi->lookup_lock); - spin_unlock(&sbi->fs_lock); - dput(ino->dentry); - - return; -} - -static void autofs4_revalidate_drop(struct dentry *dentry, - struct rehash_entry *entry) -{ - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); - /* - * Add to the active list so we can pick this up in - * ->lookup(). Also add an entry to a rehash list so - * we know when there are no dentrys in flight so we - * know when we can rehash the dentry. - */ - spin_lock(&sbi->lookup_lock); - if (list_empty(&ino->active)) - list_add(&ino->active, &sbi->active_list); - autofs4_add_rehash_entry(ino, entry); - spin_unlock(&sbi->lookup_lock); - if (!(ino->flags & AUTOFS_INF_REHASH)) { - ino->flags |= AUTOFS_INF_REHASH; - dget(dentry); - spin_lock(&dentry->d_lock); - __d_drop(dentry); - spin_unlock(&dentry->d_lock); - } - return; -} - -static void autofs4_revalidate_rehash(struct dentry *dentry) -{ - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); - if (ino->flags & AUTOFS_INF_REHASH) { - spin_lock(&sbi->lookup_lock); - autofs4_remove_rehash_entry(ino); - if (list_empty(&ino->rehash_list)) { - spin_unlock(&sbi->lookup_lock); - ino->flags &= ~AUTOFS_INF_REHASH; - d_rehash(dentry); - dput(ino->dentry); - } else - spin_unlock(&sbi->lookup_lock); - } - return; -} - static unsigned int autofs4_need_mount(unsigned int flags) { unsigned int res = 0; @@ -236,7 +143,7 @@ out: return dcache_dir_open(inode, file); } -static int try_to_fill_dentry(struct dentry *dentry) +static int try_to_fill_dentry(struct dentry *dentry, int flags) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); @@ -249,17 +156,55 @@ static int try_to_fill_dentry(struct dentry *dentry) * Wait for a pending mount, triggering one if there * isn't one already */ - DPRINTK("waiting for mount name=%.*s", - dentry->d_name.len, dentry->d_name.name); + if (dentry->d_inode == NULL) { + DPRINTK("waiting for mount name=%.*s", + dentry->d_name.len, dentry->d_name.name); - status = autofs4_wait(sbi, dentry, NFY_MOUNT); + status = autofs4_wait(sbi, dentry, NFY_MOUNT); - DPRINTK("mount done status=%d", status); + DPRINTK("mount done status=%d", status); - /* Update expiry counter */ - ino->last_used = jiffies; + /* Turn this into a real negative dentry? */ + if (status == -ENOENT) { + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_PENDING; + spin_unlock(&sbi->fs_lock); + return status; + } else if (status) { + /* Return a negative dentry, but leave it "pending" */ + return status; + } + /* Trigger mount for path component or follow link */ + } else if (ino->flags & AUTOFS_INF_PENDING || + autofs4_need_mount(flags) || + current->link_count) { + DPRINTK("waiting for mount name=%.*s", + dentry->d_name.len, dentry->d_name.name); - return status; + spin_lock(&sbi->fs_lock); + ino->flags |= AUTOFS_INF_PENDING; + spin_unlock(&sbi->fs_lock); + status = autofs4_wait(sbi, dentry, NFY_MOUNT); + + DPRINTK("mount done status=%d", status); + + if (status) { + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_PENDING; + spin_unlock(&sbi->fs_lock); + return status; + } + } + + /* Initialize expiry counter after successful mount */ + if (ino) + ino->last_used = jiffies; + + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_PENDING; + spin_unlock(&sbi->fs_lock); + + return 0; } /* For autofs direct mounts the follow link triggers the mount */ @@ -313,16 +258,10 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) */ if (ino->flags & AUTOFS_INF_PENDING || (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) { - ino->flags |= AUTOFS_INF_PENDING; spin_unlock(&dcache_lock); spin_unlock(&sbi->fs_lock); - status = try_to_fill_dentry(dentry); - - spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - + status = try_to_fill_dentry(dentry, 0); if (status) goto out_error; @@ -361,47 +300,18 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *dir = dentry->d_parent->d_inode; struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); - struct rehash_entry *entry; + int oz_mode = autofs4_oz_mode(sbi); int flags = nd ? nd->flags : 0; - unsigned int mutex_aquired; + int status = 1; - DPRINTK("name = %.*s oz_mode = %d", - dentry->d_name.len, dentry->d_name.name, oz_mode); - - /* Daemon never causes a mount to trigger */ - if (autofs4_oz_mode(sbi)) - return 1; - - entry = kmalloc(sizeof(struct rehash_entry), GFP_KERNEL); - if (!entry) - return -ENOMEM; - - mutex_aquired = mutex_trylock(&dir->i_mutex); - - spin_lock(&sbi->fs_lock); - spin_lock(&dcache_lock); /* Pending dentry */ + spin_lock(&sbi->fs_lock); if (autofs4_ispending(dentry)) { - int status; - - /* - * We can only unhash and send this to ->lookup() if - * the directory mutex is held over d_revalidate() and - * ->lookup(). This prevents the VFS from incorrectly - * seeing the dentry as non-existent. - */ - ino->flags |= AUTOFS_INF_PENDING; - if (!mutex_aquired) { - autofs4_revalidate_drop(dentry, entry); - spin_unlock(&dcache_lock); - spin_unlock(&sbi->fs_lock); - return 0; - } - spin_unlock(&dcache_lock); + /* The daemon never causes a mount to trigger */ spin_unlock(&sbi->fs_lock); - mutex_unlock(&dir->i_mutex); - kfree(entry); + + if (oz_mode) + return 1; /* * If the directory has gone away due to an expire @@ -415,82 +325,45 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) * A zero status is success otherwise we have a * negative error code. */ - status = try_to_fill_dentry(dentry); - - spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - + status = try_to_fill_dentry(dentry, flags); if (status == 0) return 1; return status; } + spin_unlock(&sbi->fs_lock); + + /* Negative dentry.. invalidate if "old" */ + if (dentry->d_inode == NULL) + return 0; /* Check for a non-mountpoint directory with no contents */ + spin_lock(&dcache_lock); if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { DPRINTK("dentry=%p %.*s, emptydir", dentry, dentry->d_name.len, dentry->d_name.name); + spin_unlock(&dcache_lock); - if (autofs4_need_mount(flags) || current->link_count) { - int status; - - /* - * We can only unhash and send this to ->lookup() if - * the directory mutex is held over d_revalidate() and - * ->lookup(). This prevents the VFS from incorrectly - * seeing the dentry as non-existent. - */ - ino->flags |= AUTOFS_INF_PENDING; - if (!mutex_aquired) { - autofs4_revalidate_drop(dentry, entry); - spin_unlock(&dcache_lock); - spin_unlock(&sbi->fs_lock); - return 0; - } - spin_unlock(&dcache_lock); - spin_unlock(&sbi->fs_lock); - mutex_unlock(&dir->i_mutex); - kfree(entry); - - /* - * A zero status is success otherwise we have a - * negative error code. - */ - status = try_to_fill_dentry(dentry); - - spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); + /* The daemon never causes a mount to trigger */ + if (oz_mode) + return 1; - if (status == 0) - return 1; + /* + * A zero status is success otherwise we have a + * negative error code. + */ + status = try_to_fill_dentry(dentry, flags); + if (status == 0) + return 1; - return status; - } + return status; } spin_unlock(&dcache_lock); - spin_unlock(&sbi->fs_lock); - - if (mutex_aquired) - mutex_unlock(&dir->i_mutex); - - kfree(entry); return 1; } -static void autofs4_free_rehash_entrys(struct autofs_info *inf) -{ - struct list_head *head = &inf->rehash_list; - struct rehash_entry *entry, *next; - list_for_each_entry_safe(entry, next, head, list) { - list_del(&entry->list); - kfree(entry); - } -} - void autofs4_dentry_release(struct dentry *de) { struct autofs_info *inf; @@ -509,8 +382,6 @@ void autofs4_dentry_release(struct dentry *de) list_del(&inf->active); if (!list_empty(&inf->expiring)) list_del(&inf->expiring); - if (!list_empty(&inf->rehash_list)) - autofs4_free_rehash_entrys(inf); spin_unlock(&sbi->lookup_lock); } @@ -543,7 +414,6 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) const unsigned char *str = name->name; struct list_head *p, *head; -restart: spin_lock(&dcache_lock); spin_lock(&sbi->lookup_lock); head = &sbi->active_list; @@ -561,19 +431,6 @@ restart: if (atomic_read(&active->d_count) == 0) goto next; - if (active->d_inode && IS_DEADDIR(active->d_inode)) { - if (!list_empty(&ino->rehash_list)) { - dget(active); - spin_unlock(&active->d_lock); - spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); - autofs4_remove_rehash_entrys(ino); - dput(active); - goto restart; - } - goto next; - } - qstr = &active->d_name; if (active->d_name.hash != hash) @@ -586,11 +443,13 @@ restart: if (memcmp(qstr->name, str, len)) goto next; - dget(active); - spin_unlock(&active->d_lock); - spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); - return active; + if (d_unhashed(active)) { + dget(active); + spin_unlock(&active->d_lock); + spin_unlock(&sbi->lookup_lock); + spin_unlock(&dcache_lock); + return active; + } next: spin_unlock(&active->d_lock); } @@ -639,11 +498,13 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) if (memcmp(qstr->name, str, len)) goto next; - dget(expiring); - spin_unlock(&expiring->d_lock); - spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); - return expiring; + if (d_unhashed(expiring)) { + dget(expiring); + spin_unlock(&expiring->d_lock); + spin_unlock(&sbi->lookup_lock); + spin_unlock(&dcache_lock); + return expiring; + } next: spin_unlock(&expiring->d_lock); } @@ -653,48 +514,6 @@ next: return NULL; } -static struct autofs_info *init_new_dentry(struct autofs_sb_info *sbi, - struct dentry *dentry, int oz_mode) -{ - struct autofs_info *ino; - - /* - * Mark the dentry incomplete but don't hash it. We do this - * to serialize our inode creation operations (symlink and - * mkdir) which prevents deadlock during the callback to - * the daemon. Subsequent user space lookups for the same - * dentry are placed on the wait queue while the daemon - * itself is allowed passage unresticted so the create - * operation itself can then hash the dentry. Finally, - * we check for the hashed dentry and return the newly - * hashed dentry. - */ - dentry->d_op = &autofs4_root_dentry_operations; - - /* - * And we need to ensure that the same dentry is used for - * all following lookup calls until it is hashed so that - * the dentry flags are persistent throughout the request. - */ - ino = autofs4_init_ino(NULL, sbi, 0555); - if (!ino) - return ERR_PTR(-ENOMEM); - - dentry->d_fsdata = ino; - ino->dentry = dentry; - - /* - * Only set the mount pending flag for new dentrys not created - * by the daemon. - */ - if (!oz_mode) - ino->flags |= AUTOFS_INF_PENDING; - - d_instantiate(dentry, NULL); - - return ino; -} - /* Lookups in the root directory */ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { @@ -702,7 +521,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s struct autofs_info *ino; struct dentry *expiring, *active; int oz_mode; - int status = 0; DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name); @@ -717,26 +535,44 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); - spin_lock(&sbi->fs_lock); active = autofs4_lookup_active(dentry); if (active) { dentry = active; ino = autofs4_dentry_ino(dentry); - /* If this came from revalidate, rehash it */ - autofs4_revalidate_rehash(dentry); - spin_unlock(&sbi->fs_lock); } else { - spin_unlock(&sbi->fs_lock); - ino = init_new_dentry(sbi, dentry, oz_mode); - if (IS_ERR(ino)) - return (struct dentry *) ino; - } + /* + * Mark the dentry incomplete but don't hash it. We do this + * to serialize our inode creation operations (symlink and + * mkdir) which prevents deadlock during the callback to + * the daemon. Subsequent user space lookups for the same + * dentry are placed on the wait queue while the daemon + * itself is allowed passage unresticted so the create + * operation itself can then hash the dentry. Finally, + * we check for the hashed dentry and return the newly + * hashed dentry. + */ + dentry->d_op = &autofs4_root_dentry_operations; + + /* + * And we need to ensure that the same dentry is used for + * all following lookup calls until it is hashed so that + * the dentry flags are persistent throughout the request. + */ + ino = autofs4_init_ino(NULL, sbi, 0555); + if (!ino) + return ERR_PTR(-ENOMEM); - autofs4_add_active(dentry); + dentry->d_fsdata = ino; + ino->dentry = dentry; + + autofs4_add_active(dentry); + + d_instantiate(dentry, NULL); + } if (!oz_mode) { - expiring = autofs4_lookup_expiring(dentry); mutex_unlock(&dir->i_mutex); + expiring = autofs4_lookup_expiring(dentry); if (expiring) { /* * If we are racing with expire the request might not @@ -744,22 +580,23 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s * so it must have been successful, so just wait for it. */ autofs4_expire_wait(expiring); + autofs4_del_expiring(expiring); dput(expiring); } - status = try_to_fill_dentry(dentry); - mutex_lock(&dir->i_mutex); + spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_PENDING; + ino->flags |= AUTOFS_INF_PENDING; spin_unlock(&sbi->fs_lock); + if (dentry->d_op && dentry->d_op->d_revalidate) + (dentry->d_op->d_revalidate)(dentry, nd); + mutex_lock(&dir->i_mutex); } - autofs4_del_active(dentry); - /* - * If we had a mount fail, check if we had to handle + * If we are still pending, check if we had to handle * a signal. If so we can force a restart.. */ - if (status) { + if (ino->flags & AUTOFS_INF_PENDING) { /* See if we were interrupted */ if (signal_pending(current)) { sigset_t *sigset = ¤t->pending.signal; @@ -771,46 +608,43 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s return ERR_PTR(-ERESTARTNOINTR); } } - } - - /* - * User space can (and has done in the past) remove and re-create - * this directory during the callback. This can leave us with an - * unhashed dentry, but a successful mount! So we need to - * perform another cached lookup in case the dentry now exists. - */ - if (!oz_mode && !have_submounts(dentry)) { - struct dentry *new; - new = d_lookup(dentry->d_parent, &dentry->d_name); - if (new) { - if (active) - dput(active); - return new; - } else { - if (!status) - status = -ENOENT; + if (!oz_mode) { + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_PENDING; + spin_unlock(&sbi->fs_lock); } } /* - * If we had a mount failure, return status to user space. - * If the mount succeeded and we used a dentry from the active queue - * return it. + * If this dentry is unhashed, then we shouldn't honour this + * lookup. Returning ENOENT here doesn't do the right thing + * for all system calls, but it should be OK for the operations + * we permit from an autofs. */ - if (status) { - dentry = ERR_PTR(status); - if (active) - dput(active); - return dentry; - } else { + if (!oz_mode && d_unhashed(dentry)) { /* - * Valid successful mount, return active dentry or NULL - * for a new dentry. + * A user space application can (and has done in the past) + * remove and re-create this directory during the callback. + * This can leave us with an unhashed dentry, but a + * successful mount! So we need to perform another + * cached lookup in case the dentry now exists. */ + struct dentry *parent = dentry->d_parent; + struct dentry *new = d_lookup(parent, &dentry->d_name); + if (new != NULL) + dentry = new; + else + dentry = ERR_PTR(-ENOENT); + if (active) - return active; + dput(active); + + return dentry; } + if (active) + return active; + return NULL; } @@ -834,6 +668,8 @@ static int autofs4_dir_symlink(struct inode *dir, if (!ino) return -ENOMEM; + autofs4_del_active(dentry); + ino->size = strlen(symname); cp = kmalloc(ino->size + 1, GFP_KERNEL); if (!cp) { @@ -910,6 +746,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) dir->i_mtime = CURRENT_TIME; spin_lock(&dcache_lock); + autofs4_add_expiring(dentry); spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); @@ -935,6 +772,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) spin_unlock(&dcache_lock); return -ENOTEMPTY; } + autofs4_add_expiring(dentry); spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); @@ -972,6 +810,8 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (!ino) return -ENOMEM; + autofs4_del_active(dentry); + inode = autofs4_get_inode(dir->i_sb, ino); if (!inode) { if (!dentry->d_fsdata) -- cgit v1.2.3 From ad2a722f196d2b014f49e6c37e072df71eb3695f Mon Sep 17 00:00:00 2001 From: Boaz Harrosh <bharrosh@panasas.com> Date: Tue, 12 Jan 2010 15:13:47 +0200 Subject: libfs: Open code simple_commit_write into only user * simple_commit_write was only called by simple_write_end. Open coding it makes it tiny bit less heavy on the arithmetic and much more readable. * While at it use zero_user() for clearing a partial page. * While at it add a docbook comment for simple_write_end. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/libfs.c | 59 +++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 6e8d17e1dc4c..cd88abdcb436 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -370,40 +370,51 @@ int simple_write_begin(struct file *file, struct address_space *mapping, return simple_prepare_write(file, page, from, from+len); } -static int simple_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) -{ - struct inode *inode = page->mapping->host; - loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - - if (!PageUptodate(page)) - SetPageUptodate(page); - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold the i_mutex. - */ - if (pos > inode->i_size) - i_size_write(inode, pos); - set_page_dirty(page); - return 0; -} - +/** + * simple_write_end - .write_end helper for non-block-device FSes + * @available: See .write_end of address_space_operations + * @file: " + * @mapping: " + * @pos: " + * @len: " + * @copied: " + * @page: " + * @fsdata: " + * + * simple_write_end does the minimum needed for updating a page after writing is + * done. It has the same API signature as the .write_end of + * address_space_operations vector. So it can just be set onto .write_end for + * FSes that don't need any other processing. i_mutex is assumed to be held. + * Block based filesystems should use generic_write_end(). + * NOTE: Even though i_size might get updated by this function, mark_inode_dirty + * is not called, so a filesystem that actually does store data in .write_inode + * should extend on what's done here with a call to mark_inode_dirty() in the + * case that i_size has changed. + */ int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - unsigned from = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = page->mapping->host; + loff_t last_pos = pos + copied; /* zero the stale part of the page if we did a short copy */ if (copied < len) { - void *kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + from + copied, 0, len - copied); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + + zero_user(page, from + copied, len - copied); } - simple_commit_write(file, page, from, from+copied); + if (!PageUptodate(page)) + SetPageUptodate(page); + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold the i_mutex. + */ + if (last_pos > inode->i_size) + i_size_write(inode, last_pos); + set_page_dirty(page); unlock_page(page); page_cache_release(page); -- cgit v1.2.3 From 193cf4b99113a4550598ba9e8343e591fc062e23 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh <bharrosh@panasas.com> Date: Tue, 12 Jan 2010 16:18:08 +0200 Subject: libfs: Unexport and kill simple_prepare_write Remove the EXPORT_UNUSED_SYMBOL of simple_prepare_write Collapse simple_prepare_write into it's only caller, though making it simpler and clearer to understand. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/libfs.c | 22 ++++++---------------- include/linux/fs.h | 2 -- 2 files changed, 6 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index cd88abdcb436..9e50bcf55857 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -338,28 +338,14 @@ int simple_readpage(struct file *file, struct page *page) return 0; } -int simple_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) -{ - if (!PageUptodate(page)) { - if (to - from != PAGE_CACHE_SIZE) - zero_user_segments(page, - 0, from, - to, PAGE_CACHE_SIZE); - } - return 0; -} - int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { struct page *page; pgoff_t index; - unsigned from; index = pos >> PAGE_CACHE_SHIFT; - from = pos & (PAGE_CACHE_SIZE - 1); page = grab_cache_page_write_begin(mapping, index, flags); if (!page) @@ -367,7 +353,12 @@ int simple_write_begin(struct file *file, struct address_space *mapping, *pagep = page; - return simple_prepare_write(file, page, from, from+len); + if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + + zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE); + } + return 0; } /** @@ -864,7 +855,6 @@ EXPORT_SYMBOL(simple_getattr); EXPORT_SYMBOL(simple_link); EXPORT_SYMBOL(simple_lookup); EXPORT_SYMBOL(simple_pin_fs); -EXPORT_UNUSED_SYMBOL(simple_prepare_write); EXPORT_SYMBOL(simple_readpage); EXPORT_SYMBOL(simple_release_fs); EXPORT_SYMBOL(simple_rename); diff --git a/include/linux/fs.h b/include/linux/fs.h index ebb1cd5bc241..2b124c825e38 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2340,8 +2340,6 @@ extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct extern int simple_sync_file(struct file *, struct dentry *, int); extern int simple_empty(struct dentry *); extern int simple_readpage(struct file *file, struct page *page); -extern int simple_prepare_write(struct file *file, struct page *page, - unsigned offset, unsigned to); extern int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); -- cgit v1.2.3 From 587d4a17d837ac0f17edb26f1b6c80c0abda6343 Mon Sep 17 00:00:00 2001 From: "Helight.Xu" <helight.xu@gmail.com> Date: Wed, 30 Dec 2009 13:24:41 +0800 Subject: some clean up in fs/proc EXPORT_SYMBOL(proc_symlink); EXPORT_SYMBOL(proc_mkdir); EXPORT_SYMBOL(create_proc_entry); EXPORT_SYMBOL(proc_create_data); EXPORT_SYMBOL(remove_proc_entry); Those EXPORT_SYMBOL shouldn't be in fs/proc/root.c, should be in fs/proc/generic.c. Signed-off-by: Helight.Xu <helight.xu@gmail.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/proc/generic.c | 5 +++++ fs/proc/root.c | 6 ------ 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 480cb1065eec..9580abeadeb3 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -662,6 +662,7 @@ struct proc_dir_entry *proc_symlink(const char *name, } return ent; } +EXPORT_SYMBOL(proc_symlink); struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, struct proc_dir_entry *parent) @@ -700,6 +701,7 @@ struct proc_dir_entry *proc_mkdir(const char *name, { return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent); } +EXPORT_SYMBOL(proc_mkdir); struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, struct proc_dir_entry *parent) @@ -728,6 +730,7 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, } return ent; } +EXPORT_SYMBOL(create_proc_entry); struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, struct proc_dir_entry *parent, @@ -762,6 +765,7 @@ out_free: out: return NULL; } +EXPORT_SYMBOL(proc_create_data); static void free_proc_entry(struct proc_dir_entry *de) { @@ -853,3 +857,4 @@ continue_removing: de->parent->name, de->name, de->subdir->name); pde_put(de); } +EXPORT_SYMBOL(remove_proc_entry); diff --git a/fs/proc/root.c b/fs/proc/root.c index b080b791d9e3..757c069f2a65 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -220,9 +220,3 @@ void pid_ns_release_proc(struct pid_namespace *ns) { mntput(ns->proc_mnt); } - -EXPORT_SYMBOL(proc_symlink); -EXPORT_SYMBOL(proc_mkdir); -EXPORT_SYMBOL(create_proc_entry); -EXPORT_SYMBOL(proc_create_data); -EXPORT_SYMBOL(remove_proc_entry); -- cgit v1.2.3 From ec4f860597af41c6b71f4de86d8e86f710bfab54 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten <hartleys@visionengravers.com> Date: Tue, 5 Jan 2010 13:45:18 -0700 Subject: fs/dcache.c: CodingStyle cleanup Cleanup EXPORT* macros according to Documantation/CodingStyle. Move EXPORT* macros to the line immediately after the closing function brace. Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/dcache.c | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 953173a293a9..4365998b8df4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -257,6 +257,7 @@ kill_it: if (dentry) goto repeat; } +EXPORT_SYMBOL(dput); /** * d_invalidate - invalidate a dentry @@ -314,6 +315,7 @@ int d_invalidate(struct dentry * dentry) spin_unlock(&dcache_lock); return 0; } +EXPORT_SYMBOL(d_invalidate); /* This should be called _only_ with dcache_lock held */ @@ -328,6 +330,7 @@ struct dentry * dget_locked(struct dentry *dentry) { return __dget_locked(dentry); } +EXPORT_SYMBOL(dget_locked); /** * d_find_alias - grab a hashed alias of inode @@ -384,6 +387,7 @@ struct dentry * d_find_alias(struct inode *inode) } return de; } +EXPORT_SYMBOL(d_find_alias); /* * Try to kill dentries associated with this inode. @@ -408,6 +412,7 @@ restart: } spin_unlock(&dcache_lock); } +EXPORT_SYMBOL(d_prune_aliases); /* * Throw away a dentry - free the inode, dput the parent. This requires that @@ -610,6 +615,7 @@ void shrink_dcache_sb(struct super_block * sb) { __shrink_dcache_sb(sb, NULL, 0); } +EXPORT_SYMBOL(shrink_dcache_sb); /* * destroy a single subtree of dentries for unmount @@ -792,6 +798,7 @@ positive: spin_unlock(&dcache_lock); return 1; } +EXPORT_SYMBOL(have_submounts); /* * Search the dentry child list for the specified parent, @@ -876,6 +883,7 @@ void shrink_dcache_parent(struct dentry * parent) while ((found = select_parent(parent)) != 0) __shrink_dcache_sb(sb, &found, 0); } +EXPORT_SYMBOL(shrink_dcache_parent); /* * Scan `nr' dentries and return the number which remain. @@ -968,6 +976,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) return dentry; } +EXPORT_SYMBOL(d_alloc); struct dentry *d_alloc_name(struct dentry *parent, const char *name) { @@ -1012,6 +1021,7 @@ void d_instantiate(struct dentry *entry, struct inode * inode) spin_unlock(&dcache_lock); security_d_instantiate(entry, inode); } +EXPORT_SYMBOL(d_instantiate); /** * d_instantiate_unique - instantiate a non-aliased dentry @@ -1108,6 +1118,7 @@ struct dentry * d_alloc_root(struct inode * root_inode) } return res; } +EXPORT_SYMBOL(d_alloc_root); static inline struct hlist_head *d_hash(struct dentry *parent, unsigned long hash) @@ -1225,6 +1236,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) d_add(dentry, inode); return new; } +EXPORT_SYMBOL(d_splice_alias); /** * d_add_ci - lookup or allocate new dentry with case-exact name @@ -1314,6 +1326,7 @@ err_out: iput(inode); return ERR_PTR(error); } +EXPORT_SYMBOL(d_add_ci); /** * d_lookup - search for a dentry @@ -1357,6 +1370,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name) } while (read_seqretry(&rename_lock, seq)); return dentry; } +EXPORT_SYMBOL(d_lookup); struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) { @@ -1483,6 +1497,7 @@ int d_validate(struct dentry *dentry, struct dentry *dparent) out: return 0; } +EXPORT_SYMBOL(d_validate); /* * When a file is deleted, we have two options: @@ -1528,6 +1543,7 @@ void d_delete(struct dentry * dentry) fsnotify_nameremove(dentry, isdir); } +EXPORT_SYMBOL(d_delete); static void __d_rehash(struct dentry * entry, struct hlist_head *list) { @@ -1556,6 +1572,7 @@ void d_rehash(struct dentry * entry) spin_unlock(&entry->d_lock); spin_unlock(&dcache_lock); } +EXPORT_SYMBOL(d_rehash); /* * When switching names, the actual string doesn't strictly have to @@ -1702,6 +1719,7 @@ void d_move(struct dentry * dentry, struct dentry * target) d_move_locked(dentry, target); spin_unlock(&dcache_lock); } +EXPORT_SYMBOL(d_move); /** * d_ancestor - search for an ancestor @@ -1868,6 +1886,7 @@ shouldnt_be_hashed: spin_unlock(&dcache_lock); BUG(); } +EXPORT_SYMBOL_GPL(d_materialise_unique); static int prepend(char **buffer, int *buflen, const char *str, int namelen) { @@ -2005,6 +2024,7 @@ char *d_path(const struct path *path, char *buf, int buflen) path_put(&root); return res; } +EXPORT_SYMBOL(d_path); /* * Helper function for dentry_operations.d_dname() members @@ -2228,6 +2248,7 @@ ino_t find_inode_number(struct dentry *dir, struct qstr *name) } return ino; } +EXPORT_SYMBOL(find_inode_number); static __initdata unsigned long dhash_entries; static int __init set_dhash_entries(char *str) @@ -2297,6 +2318,7 @@ static void __init dcache_init(void) /* SLAB cache for __getname() consumers */ struct kmem_cache *names_cachep __read_mostly; +EXPORT_SYMBOL(names_cachep); EXPORT_SYMBOL(d_genocide); @@ -2326,26 +2348,3 @@ void __init vfs_caches_init(unsigned long mempages) bdev_cache_init(); chrdev_init(); } - -EXPORT_SYMBOL(d_alloc); -EXPORT_SYMBOL(d_alloc_root); -EXPORT_SYMBOL(d_delete); -EXPORT_SYMBOL(d_find_alias); -EXPORT_SYMBOL(d_instantiate); -EXPORT_SYMBOL(d_invalidate); -EXPORT_SYMBOL(d_lookup); -EXPORT_SYMBOL(d_move); -EXPORT_SYMBOL_GPL(d_materialise_unique); -EXPORT_SYMBOL(d_path); -EXPORT_SYMBOL(d_prune_aliases); -EXPORT_SYMBOL(d_rehash); -EXPORT_SYMBOL(d_splice_alias); -EXPORT_SYMBOL(d_add_ci); -EXPORT_SYMBOL(d_validate); -EXPORT_SYMBOL(dget_locked); -EXPORT_SYMBOL(dput); -EXPORT_SYMBOL(find_inode_number); -EXPORT_SYMBOL(have_submounts); -EXPORT_SYMBOL(names_cachep); -EXPORT_SYMBOL(shrink_dcache_parent); -EXPORT_SYMBOL(shrink_dcache_sb); -- cgit v1.2.3 From d208bbdda991b8808d9c033ce4d31cb1bd87dcfc Mon Sep 17 00:00:00 2001 From: Nick Piggin <npiggin@suse.de> Date: Mon, 21 Dec 2009 16:28:53 -0800 Subject: fs: improve remount,ro vs buffercache coherency Invalidate sb->s_bdev on remount,ro. Fixes a problem reported by Jorge Boncompte who is seeing corruption trying to snapshot a minix filesystem image. Some filesystems modify their metadata via a path other than the bdev buffer cache (eg. they may use a private linear mapping for their metadata, or implement directories in pagecache, etc). Also, file data modifications usually go to the bdev via their own mappings. These updates are not coherent with buffercache IO (eg. via /dev/bdev) and never have been. However there could be a reasonable expectation that after a mount -oremount,ro operation then the buffercache should subsequently be coherent with previous filesystem modifications. So invalidate the bdev mappings on a remount,ro operation to provide a coherency point. The problem was exposed when we switched the old rd to brd because old rd didn't really function like a normal block device and updates to rd via mappings other than the buffercache would still end up going into its buffercache. But the same problem has always affected other "normal" block devices, including loop. [akpm@linux-foundation.org: repair comment layout] Reported-by: "Jorge Boncompte [DTI2]" <jorge@dti2.net> Tested-by: "Jorge Boncompte [DTI2]" <jorge@dti2.net> Signed-off-by: Nick Piggin <npiggin@suse.de> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/super.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index aff046b0fe78..903896ec7c73 100644 --- a/fs/super.c +++ b/fs/super.c @@ -568,7 +568,7 @@ out: int do_remount_sb(struct super_block *sb, int flags, void *data, int force) { int retval; - int remount_rw; + int remount_rw, remount_ro; if (sb->s_frozen != SB_UNFROZEN) return -EBUSY; @@ -583,9 +583,12 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) shrink_dcache_sb(sb); sync_filesystem(sb); + remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); + remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY); + /* If we are remounting RDONLY and current sb is read/write, make sure there are no rw files opened */ - if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) { + if (remount_ro) { if (force) mark_files_ro(sb); else if (!fs_may_remount_ro(sb)) @@ -594,7 +597,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (retval < 0 && retval != -ENOSYS) return -EBUSY; } - remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY); if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); @@ -604,6 +606,16 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); if (remount_rw) vfs_dq_quota_on_remount(sb); + /* + * Some filesystems modify their metadata via some other path than the + * bdev buffer cache (eg. use a private mapping, or directories in + * pagecache, etc). Also file data modifications go via their own + * mappings. So If we try to mount readonly then copy the filesystem + * from bdev, we could get stale data, so invalidate it to give a best + * effort at coherency. + */ + if (remount_ro && sb->s_bdev) + invalidate_bdev(sb->s_bdev); return 0; } -- cgit v1.2.3 From 8737c9305bd5602b11f7eb4655d5695d4a42a0c6 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Thu, 24 Dec 2009 06:47:55 -0500 Subject: Switch may_open() and break_lease() to passing O_... ... instead of mixing FMODE_ and O_ Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- arch/um/drivers/mconsole_kern.c | 2 +- fs/cifs/file.c | 4 ++-- fs/locks.c | 5 +++-- fs/namei.c | 10 +++++----- fs/nfsctl.c | 5 ++--- fs/nfsd/vfs.c | 4 ++-- fs/open.c | 2 +- kernel/sysctl_binary.c | 7 ++----- 8 files changed, 18 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 3b3c36601a7b..de317d0c3294 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -140,7 +140,7 @@ void mconsole_proc(struct mc_request *req) goto out; } - err = may_open(&nd.path, MAY_READ, FMODE_READ); + err = may_open(&nd.path, MAY_READ, O_RDONLY); if (result) { mconsole_reply(req, "Failed to open file", 1, 0); path_put(&nd.path); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 057e1dae12ab..3d8f8a96f5a3 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2289,9 +2289,9 @@ cifs_oplock_break(struct slow_work *work) if (inode && S_ISREG(inode->i_mode)) { #ifdef CONFIG_CIFS_EXPERIMENTAL if (cinode->clientCanCacheAll == 0) - break_lease(inode, FMODE_READ); + break_lease(inode, O_RDONLY); else if (cinode->clientCanCacheRead == 0) - break_lease(inode, FMODE_WRITE); + break_lease(inode, O_WRONLY); #endif rc = filemap_fdatawrite(inode->i_mapping); if (cinode->clientCanCacheRead == 0) { diff --git a/fs/locks.c b/fs/locks.c index a8794f233bc9..ae9ded026b7c 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1182,8 +1182,9 @@ int __break_lease(struct inode *inode, unsigned int mode) struct file_lock *fl; unsigned long break_time; int i_have_this_lease = 0; + int want_write = (mode & O_ACCMODE) != O_RDONLY; - new_fl = lease_alloc(NULL, mode & FMODE_WRITE ? F_WRLCK : F_RDLCK); + new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); lock_kernel(); @@ -1197,7 +1198,7 @@ int __break_lease(struct inode *inode, unsigned int mode) if (fl->fl_owner == current->files) i_have_this_lease = 1; - if (mode & FMODE_WRITE) { + if (want_write) { /* If we want write access, we have to revoke any lease. */ future = F_UNLCK | F_INPROGRESS; } else if (flock->fl_type & F_INPROGRESS) { diff --git a/fs/namei.c b/fs/namei.c index a4855af776a8..b20f83d1e065 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1503,7 +1503,7 @@ int may_open(struct path *path, int acc_mode, int flag) * An append-only file must be opened in append mode for writing. */ if (IS_APPEND(inode)) { - if ((flag & FMODE_WRITE) && !(flag & O_APPEND)) + if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) return -EPERM; if (flag & O_TRUNC) return -EPERM; @@ -1547,7 +1547,7 @@ static int handle_truncate(struct path *path) * what get passed to sys_open(). */ static int __open_namei_create(struct nameidata *nd, struct path *path, - int flag, int mode) + int open_flag, int mode) { int error; struct dentry *dir = nd->path.dentry; @@ -1565,7 +1565,7 @@ out_unlock: if (error) return error; /* Don't check for write permission, don't truncate */ - return may_open(&nd->path, 0, flag & ~O_TRUNC); + return may_open(&nd->path, 0, open_flag & ~O_TRUNC); } /* @@ -1736,7 +1736,7 @@ do_last: error = mnt_want_write(nd.path.mnt); if (error) goto exit_mutex_unlock; - error = __open_namei_create(&nd, &path, flag, mode); + error = __open_namei_create(&nd, &path, open_flag, mode); if (error) { mnt_drop_write(nd.path.mnt); goto exit; @@ -1798,7 +1798,7 @@ ok: if (error) goto exit; } - error = may_open(&nd.path, acc_mode, flag); + error = may_open(&nd.path, acc_mode, open_flag); if (error) { if (will_truncate) mnt_drop_write(nd.path.mnt); diff --git a/fs/nfsctl.c b/fs/nfsctl.c index d3854d94b7cf..bf9cbd242ddd 100644 --- a/fs/nfsctl.c +++ b/fs/nfsctl.c @@ -36,10 +36,9 @@ static struct file *do_open(char *name, int flags) return ERR_PTR(error); if (flags == O_RDWR) - error = may_open(&nd.path, MAY_READ|MAY_WRITE, - FMODE_READ|FMODE_WRITE); + error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags); else - error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE); + error = may_open(&nd.path, MAY_WRITE, flags); if (!error) return dentry_open(nd.path.dentry, nd.path.mnt, flags, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8715d194561a..15dc2deaac5f 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -361,7 +361,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, * If we are changing the size of the file, then * we need to break all leases. */ - host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK); + host_err = break_lease(inode, O_WRONLY | O_NONBLOCK); if (host_err == -EWOULDBLOCK) host_err = -ETIMEDOUT; if (host_err) /* ENOMEM or EWOULDBLOCK */ @@ -734,7 +734,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, * Check to see if there are any leases on this file. * This may block while leases are broken. */ - host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? FMODE_WRITE : 0)); + host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); if (host_err == -EWOULDBLOCK) host_err = -ETIMEDOUT; if (host_err) /* NOMEM or WOULDBLOCK */ diff --git a/fs/open.c b/fs/open.c index 040cef72bc00..e0b2d88b0380 100644 --- a/fs/open.c +++ b/fs/open.c @@ -271,7 +271,7 @@ static long do_sys_truncate(const char __user *pathname, loff_t length) * Make sure that there are no leases. get_write_access() protects * against the truncate racing with a lease-granting setlease(). */ - error = break_lease(inode, FMODE_WRITE); + error = break_lease(inode, O_WRONLY); if (error) goto put_write_and_out; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 8f5d16e0707a..8cd50d8f9bde 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1331,7 +1331,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, ssize_t result; char *pathname; int flags; - int acc_mode, fmode; + int acc_mode; pathname = sysctl_getname(name, nlen, &table); result = PTR_ERR(pathname); @@ -1342,15 +1342,12 @@ static ssize_t binary_sysctl(const int *name, int nlen, if (oldval && oldlen && newval && newlen) { flags = O_RDWR; acc_mode = MAY_READ | MAY_WRITE; - fmode = FMODE_READ | FMODE_WRITE; } else if (newval && newlen) { flags = O_WRONLY; acc_mode = MAY_WRITE; - fmode = FMODE_WRITE; } else if (oldval && oldlen) { flags = O_RDONLY; acc_mode = MAY_READ; - fmode = FMODE_READ; } else { result = 0; goto out_putname; @@ -1361,7 +1358,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, if (result) goto out_putname; - result = may_open(&nd.path, acc_mode, fmode); + result = may_open(&nd.path, acc_mode, flags); if (result) goto out_putpath; -- cgit v1.2.3 From c177c2ac8c5aa83ed181db44543c3b38fd1f17a6 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Thu, 14 Jan 2010 00:59:16 -0500 Subject: Switch gfs2 to nd_set_link() Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/gfs2/ops_inode.c | 113 +++++++++++++--------------------------------------- 1 file changed, 27 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 84350e1be66d..4e64352d49de 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -976,122 +976,62 @@ out: } /** - * gfs2_readlinki - return the contents of a symlink - * @ip: the symlink's inode - * @buf: a pointer to the buffer to be filled - * @len: a pointer to the length of @buf + * gfs2_follow_link - Follow a symbolic link + * @dentry: The dentry of the link + * @nd: Data that we pass to vfs_follow_link() * - * If @buf is too small, a piece of memory is kmalloc()ed and needs - * to be freed by the caller. + * This can handle symlinks of any size. * - * Returns: errno + * Returns: 0 on success or error code */ -static int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) +static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) { + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); struct gfs2_holder i_gh; struct buffer_head *dibh; unsigned int x; + char *buf; int error; gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); error = gfs2_glock_nq(&i_gh); if (error) { gfs2_holder_uninit(&i_gh); - return error; + nd_set_link(nd, ERR_PTR(error)); + return NULL; } if (!ip->i_disksize) { gfs2_consist_inode(ip); - error = -EIO; + buf = ERR_PTR(-EIO); goto out; } error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) + if (error) { + buf = ERR_PTR(error); goto out; - - x = ip->i_disksize + 1; - if (x > *len) { - *buf = kmalloc(x, GFP_NOFS); - if (!*buf) { - error = -ENOMEM; - goto out_brelse; - } } - memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x); - *len = x; - -out_brelse: + x = ip->i_disksize + 1; + buf = kmalloc(x, GFP_NOFS); + if (!buf) + buf = ERR_PTR(-ENOMEM); + else + memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), x); brelse(dibh); out: gfs2_glock_dq_uninit(&i_gh); - return error; -} - -/** - * gfs2_readlink - Read the value of a symlink - * @dentry: the symlink - * @buf: the buffer to read the symlink data into - * @size: the size of the buffer - * - * Returns: errno - */ - -static int gfs2_readlink(struct dentry *dentry, char __user *user_buf, - int user_size) -{ - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); - char array[GFS2_FAST_NAME_SIZE], *buf = array; - unsigned int len = GFS2_FAST_NAME_SIZE; - int error; - - error = gfs2_readlinki(ip, &buf, &len); - if (error) - return error; - - if (user_size > len - 1) - user_size = len - 1; - - if (copy_to_user(user_buf, buf, user_size)) - error = -EFAULT; - else - error = user_size; - - if (buf != array) - kfree(buf); - - return error; + nd_set_link(nd, buf); + return NULL; } -/** - * gfs2_follow_link - Follow a symbolic link - * @dentry: The dentry of the link - * @nd: Data that we pass to vfs_follow_link() - * - * This can handle symlinks of any size. It is optimised for symlinks - * under GFS2_FAST_NAME_SIZE. - * - * Returns: 0 on success or error code - */ - -static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) +static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p) { - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); - char array[GFS2_FAST_NAME_SIZE], *buf = array; - unsigned int len = GFS2_FAST_NAME_SIZE; - int error; - - error = gfs2_readlinki(ip, &buf, &len); - if (!error) { - error = vfs_follow_link(nd, buf); - if (buf != array) - kfree(buf); - } else - path_put(&nd->path); - - return ERR_PTR(error); + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + kfree(s); } /** @@ -1426,8 +1366,9 @@ const struct inode_operations gfs2_dir_iops = { }; const struct inode_operations gfs2_symlink_iops = { - .readlink = gfs2_readlink, + .readlink = generic_readlink, .follow_link = gfs2_follow_link, + .put_link = gfs2_put_link, .permission = gfs2_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, -- cgit v1.2.3 From 796a6b521d0eadb338adf8cf7e482351c3a8a7b4 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sat, 16 Jan 2010 13:28:47 -0500 Subject: Kill CL_PROPAGATION, sanitize fs/pnode.c:get_source() First of all, get_source() never results in CL_PROPAGATION alone. We either get CL_MAKE_SHARED (for the continuation of peer group) or CL_SLAVE (slave that is not shared) or both (beginning of peer group among slaves). Massage the code to make that explicit, kill CL_PROPAGATION test in clone_mnt() (nothing sets CL_MAKE_SHARED without CL_PROPAGATION and in clone_mnt() we are checking CL_PROPAGATION after we'd found that there's no CL_SLAVE, so the check for CL_MAKE_SHARED would do just as well). Fix comments, while we are at it... Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namespace.c | 2 +- fs/pnode.c | 28 +++++++++++++++++----------- fs/pnode.h | 3 +-- 3 files changed, 19 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index c768f733c8d6..25c1dcf9e9eb 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -573,7 +573,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, mnt->mnt_master = old; CLEAR_MNT_SHARED(mnt); } else if (!(flag & CL_PRIVATE)) { - if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old)) + if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) list_add(&mnt->mnt_share, &old->mnt_share); if (IS_MNT_SLAVE(old)) list_add(&mnt->mnt_slave, &old->mnt_slave); diff --git a/fs/pnode.c b/fs/pnode.c index 8d5f392ec3d3..5cc564a83149 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -86,7 +86,7 @@ static int do_make_slave(struct vfsmount *mnt) /* * slave 'mnt' to a peer mount that has the - * same root dentry. If none is available than + * same root dentry. If none is available then * slave it to anything that is available. */ while ((peer_mnt = next_peer(peer_mnt)) != mnt && @@ -147,6 +147,11 @@ void change_mnt_propagation(struct vfsmount *mnt, int type) * get the next mount in the propagation tree. * @m: the mount seen last * @origin: the original mount from where the tree walk initiated + * + * Note that peer groups form contiguous segments of slave lists. + * We rely on that in get_source() to be able to find out if + * vfsmount found while iterating with propagation_next() is + * a peer of one we'd found earlier. */ static struct vfsmount *propagation_next(struct vfsmount *m, struct vfsmount *origin) @@ -186,10 +191,6 @@ static struct vfsmount *get_source(struct vfsmount *dest, { struct vfsmount *p_last_src = NULL; struct vfsmount *p_last_dest = NULL; - *type = CL_PROPAGATION; - - if (IS_MNT_SHARED(dest)) - *type |= CL_MAKE_SHARED; while (last_dest != dest->mnt_master) { p_last_dest = last_dest; @@ -202,13 +203,18 @@ static struct vfsmount *get_source(struct vfsmount *dest, do { p_last_dest = next_peer(p_last_dest); } while (IS_MNT_NEW(p_last_dest)); + /* is that a peer of the earlier? */ + if (dest == p_last_dest) { + *type = CL_MAKE_SHARED; + return p_last_src; + } } - - if (dest != p_last_dest) { - *type |= CL_SLAVE; - return last_src; - } else - return p_last_src; + /* slave of the earlier, then */ + *type = CL_SLAVE; + /* beginning of peer group among the slaves? */ + if (IS_MNT_SHARED(dest)) + *type |= CL_MAKE_SHARED; + return last_src; } /* diff --git a/fs/pnode.h b/fs/pnode.h index 958665d662af..6c7ef3252a26 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -21,8 +21,7 @@ #define CL_SLAVE 0x02 #define CL_COPY_ALL 0x04 #define CL_MAKE_SHARED 0x08 -#define CL_PROPAGATION 0x10 -#define CL_PRIVATE 0x20 +#define CL_PRIVATE 0x10 static inline void set_mnt_shared(struct vfsmount *mnt) { -- cgit v1.2.3 From f598f9f1252b33410ffc52f51e117645ac5116c4 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sat, 23 Jan 2010 20:08:53 -0500 Subject: Sanitize autofs_dev_ioctl_ismountpoint() Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/autofs4/dev-ioctl.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 00bf8fcb245f..c8a80dffb455 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -544,10 +544,9 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, goto out; devid = new_encode_dev(path.mnt->mnt_sb->s_dev); err = 0; - if (path.dentry->d_inode && - path.mnt->mnt_root == path.dentry) { + if (path.mnt->mnt_root == path.dentry) { err = 1; - magic = path.dentry->d_inode->i_sb->s_magic; + magic = path.mnt->mnt_sb->s_magic; } } else { dev_t dev = sbi->sb->s_dev; @@ -560,10 +559,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, err = have_submounts(path.dentry); - if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) { - if (follow_down(&path)) - magic = path.mnt->mnt_sb->s_magic; - } + if (follow_down(&path)) + magic = path.mnt->mnt_sb->s_magic; } param->ismountpoint.out.devid = devid; -- cgit v1.2.3 From 3899167dbd6832a3d8d7171b425257ad46b6c40c Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sat, 23 Jan 2010 20:10:29 -0500 Subject: Get rid of mnt_mountpoint abuses in ext4 path to mnt/mnt->mnt_root is no worse than that to mnt->mnt_parent/mnt->mnt_mountpoint *and* needs no pinning the sucker down (mnt is not going away and mnt->mnt_root won't change) Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/ext4/file.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 9630583cef28..56eee3d796c2 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -116,11 +116,9 @@ static int ext4_file_open(struct inode * inode, struct file * filp) * devices or filesystem images. */ memset(buf, 0, sizeof(buf)); - path.mnt = mnt->mnt_parent; - path.dentry = mnt->mnt_mountpoint; - path_get(&path); + path.mnt = mnt; + path.dentry = mnt->mnt_root; cp = d_path(&path, buf, sizeof(buf)); - path_put(&path); if (!IS_ERR(cp)) { memcpy(sbi->s_es->s_last_mounted, cp, sizeof(sbi->s_es->s_last_mounted)); -- cgit v1.2.3 From 5b7e934d887c67fe093b61f1308bc2d9c49381ff Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sun, 24 Jan 2010 00:28:52 -0500 Subject: Use kill_litter_super() in autofs4 ->kill_sb() ... and get rid of open-coding its guts (i.e. RIP autofs4_force_release()) Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/autofs4/inode.c | 62 +----------------------------------------------------- 1 file changed, 1 insertion(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 4670a7818eac..821b2b955dac 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -96,63 +96,6 @@ void autofs4_free_ino(struct autofs_info *ino) kfree(ino); } -/* - * Deal with the infamous "Busy inodes after umount ..." message. - * - * Clean up the dentry tree. This happens with autofs if the user - * space program goes away due to a SIGKILL, SIGSEGV etc. - */ -static void autofs4_force_release(struct autofs_sb_info *sbi) -{ - struct dentry *this_parent = sbi->sb->s_root; - struct list_head *next; - - if (!sbi->sb->s_root) - return; - - spin_lock(&dcache_lock); -repeat: - next = this_parent->d_subdirs.next; -resume: - while (next != &this_parent->d_subdirs) { - struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child); - - /* Negative dentry - don`t care */ - if (!simple_positive(dentry)) { - next = next->next; - continue; - } - - if (!list_empty(&dentry->d_subdirs)) { - this_parent = dentry; - goto repeat; - } - - next = next->next; - spin_unlock(&dcache_lock); - - DPRINTK("dentry %p %.*s", - dentry, (int)dentry->d_name.len, dentry->d_name.name); - - dput(dentry); - spin_lock(&dcache_lock); - } - - if (this_parent != sbi->sb->s_root) { - struct dentry *dentry = this_parent; - - next = this_parent->d_u.d_child.next; - this_parent = this_parent->d_parent; - spin_unlock(&dcache_lock); - DPRINTK("parent dentry %p %.*s", - dentry, (int)dentry->d_name.len, dentry->d_name.name); - dput(dentry); - spin_lock(&dcache_lock); - goto resume; - } - spin_unlock(&dcache_lock); -} - void autofs4_kill_sb(struct super_block *sb) { struct autofs_sb_info *sbi = autofs4_sbi(sb); @@ -169,15 +112,12 @@ void autofs4_kill_sb(struct super_block *sb) /* Free wait queues, close pipe */ autofs4_catatonic_mode(sbi); - /* Clean up and release dangling references */ - autofs4_force_release(sbi); - sb->s_fs_info = NULL; kfree(sbi); out_kill_sb: DPRINTK("shutting down"); - kill_anon_super(sb); + kill_litter_super(sb); } static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) -- cgit v1.2.3 From 495d6c9c6595ec7b37910dfd42634839431d21fd Mon Sep 17 00:00:00 2001 From: Valerie Aurora <vaurora@redhat.com> Date: Tue, 26 Jan 2010 14:20:47 -0500 Subject: VFS: Clean up shared mount flag propagation The handling of mount flags in set_mnt_shared() got a little tangled up during previous cleanups, with the following problems: * MNT_PNODE_MASK is defined as a literal constant when it should be a bitwise xor of other MNT_* flags * set_mnt_shared() clears and then sets MNT_SHARED (part of MNT_PNODE_MASK) * MNT_PNODE_MASK could use a comment in mount.h * MNT_PNODE_MASK is a terrible name, change to MNT_SHARED_MASK This patch fixes these problems. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namespace.c | 2 +- fs/pnode.h | 2 +- include/linux/mount.h | 11 ++++++++++- 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 25c1dcf9e9eb..d25d4602ab50 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1538,7 +1538,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags, err = do_remount_sb(sb, flags, data, 0); if (!err) { spin_lock(&vfsmount_lock); - mnt_flags |= path->mnt->mnt_flags & MNT_PNODE_MASK; + mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK; path->mnt->mnt_flags = mnt_flags; spin_unlock(&vfsmount_lock); } diff --git a/fs/pnode.h b/fs/pnode.h index 6c7ef3252a26..1ea4ae1efcd3 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -25,7 +25,7 @@ static inline void set_mnt_shared(struct vfsmount *mnt) { - mnt->mnt_flags &= ~MNT_PNODE_MASK; + mnt->mnt_flags &= ~MNT_SHARED_MASK; mnt->mnt_flags |= MNT_SHARED; } diff --git a/include/linux/mount.h b/include/linux/mount.h index 5d5275364867..375d43a5d802 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -34,7 +34,16 @@ struct mnt_namespace; #define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */ #define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */ -#define MNT_PNODE_MASK 0x3000 /* propagation flag mask */ +/* + * MNT_SHARED_MASK is the set of flags that should be cleared when a + * mount becomes shared. Currently, this is only the flag that says a + * mount cannot be bind mounted, since this is how we create a mount + * that shares events with another mount. If you add a new MNT_* + * flag, consider how it interacts with shared mounts. + */ +#define MNT_SHARED_MASK (MNT_UNBINDABLE) +#define MNT_PROPAGATION_MASK (MNT_SHARED | MNT_UNBINDABLE) + struct vfsmount { struct list_head mnt_hash; -- cgit v1.2.3 From 2096f759abcb42200a81d776f597362fd9265024 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sat, 30 Jan 2010 13:16:21 -0500 Subject: New helper: path_is_under(path1, path2) Analog of is_subdir for vfsmount,dentry pairs, moved from audit_tree.c Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/dcache.c | 24 ++++++++++++++++++++++++ include/linux/fs.h | 1 + kernel/audit_tree.c | 51 ++++++++++++--------------------------------------- 3 files changed, 37 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 4365998b8df4..74da947b160b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2191,6 +2191,30 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) return result; } +int path_is_under(struct path *path1, struct path *path2) +{ + struct vfsmount *mnt = path1->mnt; + struct dentry *dentry = path1->dentry; + int res; + spin_lock(&vfsmount_lock); + if (mnt != path2->mnt) { + for (;;) { + if (mnt->mnt_parent == mnt) { + spin_unlock(&vfsmount_lock); + return 0; + } + if (mnt->mnt_parent == path2->mnt) + break; + mnt = mnt->mnt_parent; + } + dentry = mnt->mnt_mountpoint; + } + res = is_subdir(dentry, path2->dentry); + spin_unlock(&vfsmount_lock); + return res; +} +EXPORT_SYMBOL(path_is_under); + void d_genocide(struct dentry *root) { struct dentry *this_parent = root; diff --git a/include/linux/fs.h b/include/linux/fs.h index d443c9dd3caa..8d53bc17f93f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2126,6 +2126,7 @@ extern struct file * open_exec(const char *); /* fs/dcache.c -- generic fs support functions */ extern int is_subdir(struct dentry *, struct dentry *); +extern int path_is_under(struct path *, struct path *); extern ino_t find_inode_number(struct dentry *, struct qstr *); #include <linux/err.h> diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 4b05bd9479db..f09b42d9c32d 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -603,22 +603,6 @@ skip_it: mutex_unlock(&audit_filter_mutex); } -static int is_under(struct vfsmount *mnt, struct dentry *dentry, - struct path *path) -{ - if (mnt != path->mnt) { - for (;;) { - if (mnt->mnt_parent == mnt) - return 0; - if (mnt->mnt_parent == path->mnt) - break; - mnt = mnt->mnt_parent; - } - dentry = mnt->mnt_mountpoint; - } - return is_subdir(dentry, path->dentry); -} - int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) { @@ -714,29 +698,24 @@ int audit_tag_tree(char *old, char *new) { struct list_head cursor, barrier; int failed = 0; - struct path path; + struct path path1, path2; struct vfsmount *tagged; struct list_head list; - struct vfsmount *mnt; - struct dentry *dentry; int err; - err = kern_path(new, 0, &path); + err = kern_path(new, 0, &path2); if (err) return err; - tagged = collect_mounts(&path); - path_put(&path); + tagged = collect_mounts(&path2); + path_put(&path2); if (!tagged) return -ENOMEM; - err = kern_path(old, 0, &path); + err = kern_path(old, 0, &path1); if (err) { drop_collected_mounts(tagged); return err; } - mnt = mntget(path.mnt); - dentry = dget(path.dentry); - path_put(&path); list_add_tail(&list, &tagged->mnt_list); @@ -747,6 +726,7 @@ int audit_tag_tree(char *old, char *new) while (cursor.next != &tree_list) { struct audit_tree *tree; struct vfsmount *p; + int good_one = 0; tree = container_of(cursor.next, struct audit_tree, list); get_tree(tree); @@ -754,23 +734,17 @@ int audit_tag_tree(char *old, char *new) list_add(&cursor, &tree->list); mutex_unlock(&audit_filter_mutex); - err = kern_path(tree->pathname, 0, &path); - if (err) { - put_tree(tree); - mutex_lock(&audit_filter_mutex); - continue; + err = kern_path(tree->pathname, 0, &path2); + if (!err) { + good_one = path_is_under(&path1, &path2); + path_put(&path2); } - spin_lock(&vfsmount_lock); - if (!is_under(mnt, dentry, &path)) { - spin_unlock(&vfsmount_lock); - path_put(&path); + if (!good_one) { put_tree(tree); mutex_lock(&audit_filter_mutex); continue; } - spin_unlock(&vfsmount_lock); - path_put(&path); list_for_each_entry(p, &list, mnt_list) { failed = tag_chunk(p->mnt_root->d_inode, tree); @@ -820,8 +794,7 @@ int audit_tag_tree(char *old, char *new) list_del(&cursor); list_del(&list); mutex_unlock(&audit_filter_mutex); - dput(dentry); - mntput(mnt); + path_put(&path1); drop_collected_mounts(tagged); return failed; } -- cgit v1.2.3 From 6eae7974d0490a9dbc3091f702ea1650871652a9 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sat, 30 Jan 2010 13:44:07 -0500 Subject: Switch alloc_nfs_open_context() to struct path Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/nfs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f141bde7756a..7570573bdb30 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -574,14 +574,14 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) nfs_revalidate_inode(server, inode); } -static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred) +static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred) { struct nfs_open_context *ctx; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (ctx != NULL) { - ctx->path.dentry = dget(dentry); - ctx->path.mnt = mntget(mnt); + ctx->path = *path; + path_get(&ctx->path); ctx->cred = get_rpccred(cred); ctx->state = NULL; ctx->lockowner = current->files; @@ -686,7 +686,7 @@ int nfs_open(struct inode *inode, struct file *filp) cred = rpc_lookup_cred(); if (IS_ERR(cred)) return PTR_ERR(cred); - ctx = alloc_nfs_open_context(filp->f_path.mnt, filp->f_path.dentry, cred); + ctx = alloc_nfs_open_context(&filp->f_path, cred); put_rpccred(cred); if (ctx == NULL) return -ENOMEM; -- cgit v1.2.3 From f694869709cc39a5fbde21aa40f22999ddad0e6e Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sat, 30 Jan 2010 13:51:04 -0500 Subject: a couple of mntget+dget -> path_get in nfs4proc Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/nfs/nfs4proc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 375f0fae2c6a..84d83be25a98 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -724,8 +724,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); if (p->o_arg.seqid == NULL) goto err_free; - p->path.mnt = mntget(path->mnt); - p->path.dentry = dget(path->dentry); + path_get(path); + p->path = *path; p->dir = parent; p->owner = sp; atomic_inc(&sp->so_count); @@ -1944,8 +1944,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; - calldata->path.mnt = mntget(path->mnt); - calldata->path.dentry = dget(path->dentry); + path_get(path); + calldata->path = *path; msg.rpc_argp = &calldata->arg, msg.rpc_resp = &calldata->res, -- cgit v1.2.3 From 3088dd7080d1ecc6d18c27ef9e617cbbd2a2e51e Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sat, 30 Jan 2010 15:47:29 -0500 Subject: Clean follow_dotdot() up a bit No need to open-code follow_up() in it and locking can be lighter. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index b20f83d1e065..3df2ed50ab57 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -689,33 +689,20 @@ static __always_inline void follow_dotdot(struct nameidata *nd) set_root(nd); while(1) { - struct vfsmount *parent; struct dentry *old = nd->path.dentry; if (nd->path.dentry == nd->root.dentry && nd->path.mnt == nd->root.mnt) { break; } - spin_lock(&dcache_lock); if (nd->path.dentry != nd->path.mnt->mnt_root) { - nd->path.dentry = dget(nd->path.dentry->d_parent); - spin_unlock(&dcache_lock); + /* rare case of legitimate dget_parent()... */ + nd->path.dentry = dget_parent(nd->path.dentry); dput(old); break; } - spin_unlock(&dcache_lock); - spin_lock(&vfsmount_lock); - parent = nd->path.mnt->mnt_parent; - if (parent == nd->path.mnt) { - spin_unlock(&vfsmount_lock); + if (!follow_up(&nd->path)) break; - } - mntget(parent); - nd->path.dentry = dget(nd->path.mnt->mnt_mountpoint); - spin_unlock(&vfsmount_lock); - dput(old); - mntput(nd->path.mnt); - nd->path.mnt = parent; } follow_mount(&nd->path); } -- cgit v1.2.3 From 462d60577a997aa87c935ae4521bd303733a9f2b Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sat, 30 Jan 2010 16:11:21 -0500 Subject: fix NFS4 handling of mountpoint stat RFC says we need to follow the chain of mounts if there's more than one stacked on that point. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/nfsd/nfs4xdr.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index a8587e90fd5a..bbf72d8f9fc0 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2121,9 +2121,15 @@ out_acl: * and this is the root of a cross-mounted filesystem. */ if (ignore_crossmnt == 0 && - exp->ex_path.mnt->mnt_root->d_inode == dentry->d_inode) { - err = vfs_getattr(exp->ex_path.mnt->mnt_parent, - exp->ex_path.mnt->mnt_mountpoint, &stat); + dentry == exp->ex_path.mnt->mnt_root) { + struct path path = exp->ex_path; + path_get(&path); + while (follow_up(&path)) { + if (path.dentry != path.mnt->mnt_root) + break; + } + err = vfs_getattr(path.mnt, path.dentry, &stat); + path_put(&path); if (err) goto out_nfserr; } -- cgit v1.2.3 From 1f707137b55764740981d022d29c622832a61880 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sat, 30 Jan 2010 22:51:25 -0500 Subject: new helper: iterate_mounts() apply function to vfsmounts in set returned by collect_mounts(), stop if it returns non-zero. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namespace.c | 15 +++++++++++++++ include/linux/fs.h | 3 ++- kernel/audit_tree.c | 49 ++++++++++++++++--------------------------------- 3 files changed, 33 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index d25d4602ab50..d5906c19e08e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1246,6 +1246,21 @@ void drop_collected_mounts(struct vfsmount *mnt) release_mounts(&umount_list); } +int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, + struct vfsmount *root) +{ + struct vfsmount *mnt; + int res = f(root, arg); + if (res) + return res; + list_for_each_entry(mnt, &root->mnt_list, mnt_list) { + res = f(mnt, arg); + if (res) + return res; + } + return 0; +} + static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) { struct vfsmount *p; diff --git a/include/linux/fs.h b/include/linux/fs.h index 8d53bc17f93f..e764f247d0ab 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1794,7 +1794,8 @@ extern int may_umount(struct vfsmount *); extern long do_mount(char *, char *, char *, unsigned long, void *); extern struct vfsmount *collect_mounts(struct path *); extern void drop_collected_mounts(struct vfsmount *); - +extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, + struct vfsmount *); extern int vfs_statfs(struct dentry *, struct kstatfs *); extern int current_umask(void); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index f09b42d9c32d..028e85663f27 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -548,6 +548,11 @@ int audit_remove_tree_rule(struct audit_krule *rule) return 0; } +static int compare_root(struct vfsmount *mnt, void *arg) +{ + return mnt->mnt_root->d_inode == arg; +} + void audit_trim_trees(void) { struct list_head cursor; @@ -559,7 +564,6 @@ void audit_trim_trees(void) struct path path; struct vfsmount *root_mnt; struct node *node; - struct list_head list; int err; tree = container_of(cursor.next, struct audit_tree, list); @@ -577,24 +581,16 @@ void audit_trim_trees(void) if (!root_mnt) goto skip_it; - list_add_tail(&list, &root_mnt->mnt_list); spin_lock(&hash_lock); list_for_each_entry(node, &tree->chunks, list) { - struct audit_chunk *chunk = find_chunk(node); - struct inode *inode = chunk->watch.inode; - struct vfsmount *mnt; + struct inode *inode = find_chunk(node)->watch.inode; node->index |= 1U<<31; - list_for_each_entry(mnt, &list, mnt_list) { - if (mnt->mnt_root->d_inode == inode) { - node->index &= ~(1U<<31); - break; - } - } + if (iterate_mounts(compare_root, inode, root_mnt)) + node->index &= ~(1U<<31); } spin_unlock(&hash_lock); trim_marked(tree); put_tree(tree); - list_del_init(&list); drop_collected_mounts(root_mnt); skip_it: mutex_lock(&audit_filter_mutex); @@ -622,13 +618,17 @@ void audit_put_tree(struct audit_tree *tree) put_tree(tree); } +static int tag_mount(struct vfsmount *mnt, void *arg) +{ + return tag_chunk(mnt->mnt_root->d_inode, arg); +} + /* called with audit_filter_mutex */ int audit_add_tree_rule(struct audit_krule *rule) { struct audit_tree *seed = rule->tree, *tree; struct path path; - struct vfsmount *mnt, *p; - struct list_head list; + struct vfsmount *mnt; int err; list_for_each_entry(tree, &tree_list, list) { @@ -654,16 +654,9 @@ int audit_add_tree_rule(struct audit_krule *rule) err = -ENOMEM; goto Err; } - list_add_tail(&list, &mnt->mnt_list); get_tree(tree); - list_for_each_entry(p, &list, mnt_list) { - err = tag_chunk(p->mnt_root->d_inode, tree); - if (err) - break; - } - - list_del(&list); + err = iterate_mounts(tag_mount, tree, mnt); drop_collected_mounts(mnt); if (!err) { @@ -700,7 +693,6 @@ int audit_tag_tree(char *old, char *new) int failed = 0; struct path path1, path2; struct vfsmount *tagged; - struct list_head list; int err; err = kern_path(new, 0, &path2); @@ -717,15 +709,12 @@ int audit_tag_tree(char *old, char *new) return err; } - list_add_tail(&list, &tagged->mnt_list); - mutex_lock(&audit_filter_mutex); list_add(&barrier, &tree_list); list_add(&cursor, &barrier); while (cursor.next != &tree_list) { struct audit_tree *tree; - struct vfsmount *p; int good_one = 0; tree = container_of(cursor.next, struct audit_tree, list); @@ -746,12 +735,7 @@ int audit_tag_tree(char *old, char *new) continue; } - list_for_each_entry(p, &list, mnt_list) { - failed = tag_chunk(p->mnt_root->d_inode, tree); - if (failed) - break; - } - + failed = iterate_mounts(tag_mount, tree, tagged); if (failed) { put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -792,7 +776,6 @@ int audit_tag_tree(char *old, char *new) } list_del(&barrier); list_del(&cursor); - list_del(&list); mutex_unlock(&audit_filter_mutex); path_put(&path1); drop_collected_mounts(tagged); -- cgit v1.2.3 From 7e7742ee005c887b86fd1fd38d5b48419329dfa0 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sun, 31 Jan 2010 17:09:29 -0500 Subject: sanitize signedness/const for pointers to char in hpfs a bit Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/hpfs/anode.c | 2 +- fs/hpfs/dentry.c | 14 +++++++------ fs/hpfs/dir.c | 14 ++++++------- fs/hpfs/dnode.c | 21 +++++++++++-------- fs/hpfs/ea.c | 7 ++++--- fs/hpfs/hpfs_fn.h | 30 +++++++++++++++++----------- fs/hpfs/inode.c | 4 ++-- fs/hpfs/map.c | 6 +++--- fs/hpfs/name.c | 21 +++++++++---------- fs/hpfs/namei.c | 60 +++++++++++++++++++++++++++---------------------------- 10 files changed, 97 insertions(+), 82 deletions(-) (limited to 'fs') diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c index 1aa88c4e0964..6a2f04bf3df0 100644 --- a/fs/hpfs/anode.c +++ b/fs/hpfs/anode.c @@ -353,7 +353,7 @@ int hpfs_ea_read(struct super_block *s, secno a, int ano, unsigned pos, } int hpfs_ea_write(struct super_block *s, secno a, int ano, unsigned pos, - unsigned len, char *buf) + unsigned len, const char *buf) { struct buffer_head *bh; char *data; diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c index 940d6d150bee..67d9d36b3d5f 100644 --- a/fs/hpfs/dentry.c +++ b/fs/hpfs/dentry.c @@ -20,8 +20,8 @@ static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr) if (l == 1) if (qstr->name[0]=='.') goto x; if (l == 2) if (qstr->name[0]=='.' || qstr->name[1]=='.') goto x; - hpfs_adjust_length((char *)qstr->name, &l); - /*if (hpfs_chk_name((char *)qstr->name,&l))*/ + hpfs_adjust_length(qstr->name, &l); + /*if (hpfs_chk_name(qstr->name,&l))*/ /*return -ENAMETOOLONG;*/ /*return -ENOENT;*/ x: @@ -38,14 +38,16 @@ static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qst { unsigned al=a->len; unsigned bl=b->len; - hpfs_adjust_length((char *)a->name, &al); - /*hpfs_adjust_length((char *)b->name, &bl);*/ + hpfs_adjust_length(a->name, &al); + /*hpfs_adjust_length(b->name, &bl);*/ /* 'a' is the qstr of an already existing dentry, so the name * must be valid. 'b' must be validated first. */ - if (hpfs_chk_name((char *)b->name, &bl)) return 1; - if (hpfs_compare_names(dentry->d_sb, (char *)a->name, al, (char *)b->name, bl, 0)) return 1; + if (hpfs_chk_name(b->name, &bl)) + return 1; + if (hpfs_compare_names(dentry->d_sb, a->name, al, b->name, bl, 0)) + return 1; return 0; } diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 8865c94f55f6..26e3964a4b8c 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -59,7 +59,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) struct hpfs_dirent *de; int lc; long old_pos; - char *tempname; + unsigned char *tempname; int c1, c2 = 0; int ret = 0; @@ -158,11 +158,11 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3); if (filldir(dirent, tempname, de->namelen, old_pos, de->fnode, DT_UNKNOWN) < 0) { filp->f_pos = old_pos; - if (tempname != (char *)de->name) kfree(tempname); + if (tempname != de->name) kfree(tempname); hpfs_brelse4(&qbh); goto out; } - if (tempname != (char *)de->name) kfree(tempname); + if (tempname != de->name) kfree(tempname); hpfs_brelse4(&qbh); } out: @@ -187,7 +187,7 @@ out: struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { - const char *name = dentry->d_name.name; + const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; struct quad_buffer_head qbh; struct hpfs_dirent *de; @@ -197,7 +197,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name struct hpfs_inode_info *hpfs_result; lock_kernel(); - if ((err = hpfs_chk_name((char *)name, &len))) { + if ((err = hpfs_chk_name(name, &len))) { if (err == -ENAMETOOLONG) { unlock_kernel(); return ERR_PTR(-ENAMETOOLONG); @@ -209,7 +209,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name * '.' and '..' will never be passed here. */ - de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *) name, len, NULL, &qbh); + de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, NULL, &qbh); /* * This is not really a bailout, just means file not found. @@ -250,7 +250,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name hpfs_result = hpfs_i(result); if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino; - hpfs_decide_conv(result, (char *)name, len); + hpfs_decide_conv(result, name, len); if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) { hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures"); diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index fe83c2b7d2d8..9b2ffadfc8c4 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -158,7 +158,8 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno /* Add an entry to dnode and don't care if it grows over 2048 bytes */ -struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d, unsigned char *name, +struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d, + const unsigned char *name, unsigned namelen, secno down_ptr) { struct hpfs_dirent *de; @@ -223,7 +224,7 @@ static void fix_up_ptrs(struct super_block *s, struct dnode *d) /* Add an entry to dnode and do dnode splitting if required */ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, - unsigned char *name, unsigned namelen, + const unsigned char *name, unsigned namelen, struct hpfs_dirent *new_de, dnode_secno down_ptr) { struct quad_buffer_head qbh, qbh1, qbh2; @@ -231,7 +232,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, dnode_secno adno, rdno; struct hpfs_dirent *de; struct hpfs_dirent nde; - char *nname; + unsigned char *nname; int h; int pos; struct buffer_head *bh; @@ -305,7 +306,9 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, pos++; } copy_de(new_de = &nde, de); - memcpy(name = nname, de->name, namelen = de->namelen); + memcpy(nname, de->name, de->namelen); + name = nname; + namelen = de->namelen; for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, 4); down_ptr = adno; set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0); @@ -368,7 +371,8 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, * I hope, now it's finally bug-free. */ -int hpfs_add_dirent(struct inode *i, unsigned char *name, unsigned namelen, +int hpfs_add_dirent(struct inode *i, + const unsigned char *name, unsigned namelen, struct hpfs_dirent *new_de, int cdepth) { struct hpfs_inode_info *hpfs_inode = hpfs_i(i); @@ -897,7 +901,8 @@ struct hpfs_dirent *map_pos_dirent(struct inode *inode, loff_t *posp, /* Find a dirent in tree */ -struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno, char *name, unsigned len, +struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno, + const unsigned char *name, unsigned len, dnode_secno *dd, struct quad_buffer_head *qbh) { struct dnode *dnode; @@ -988,8 +993,8 @@ void hpfs_remove_dtree(struct super_block *s, dnode_secno dno) struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, struct fnode *f, struct quad_buffer_head *qbh) { - char *name1; - char *name2; + unsigned char *name1; + unsigned char *name2; int name1len, name2len; struct dnode *d; dnode_secno dno, downd; diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c index 547a8384571f..45e53d972b42 100644 --- a/fs/hpfs/ea.c +++ b/fs/hpfs/ea.c @@ -62,8 +62,8 @@ static char *get_indirect_ea(struct super_block *s, int ano, secno a, int size) return ret; } -static void set_indirect_ea(struct super_block *s, int ano, secno a, char *data, - int size) +static void set_indirect_ea(struct super_block *s, int ano, secno a, + const char *data, int size) { hpfs_ea_write(s, a, ano, 0, size, data); } @@ -186,7 +186,8 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si * This driver can't change sizes of eas ('cause I just don't need it). */ -void hpfs_set_ea(struct inode *inode, struct fnode *fnode, char *key, char *data, int size) +void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, + const char *data, int size) { fnode_secno fno = inode->i_ino; struct super_block *s = inode->i_sb; diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 701ca54c0867..97bf738cd5d6 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -215,7 +215,7 @@ secno hpfs_bplus_lookup(struct super_block *, struct inode *, struct bplus_heade secno hpfs_add_sector_to_btree(struct super_block *, secno, int, unsigned); void hpfs_remove_btree(struct super_block *, struct bplus_header *); int hpfs_ea_read(struct super_block *, secno, int, unsigned, unsigned, char *); -int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, char *); +int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, const char *); void hpfs_ea_remove(struct super_block *, secno, int, unsigned); void hpfs_truncate_btree(struct super_block *, secno, int, unsigned); void hpfs_remove_fnode(struct super_block *, fnode_secno fno); @@ -244,13 +244,17 @@ extern const struct file_operations hpfs_dir_ops; void hpfs_add_pos(struct inode *, loff_t *); void hpfs_del_pos(struct inode *, loff_t *); -struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *, unsigned char *, unsigned, secno); -int hpfs_add_dirent(struct inode *, unsigned char *, unsigned, struct hpfs_dirent *, int); +struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *, + const unsigned char *, unsigned, secno); +int hpfs_add_dirent(struct inode *, const unsigned char *, unsigned, + struct hpfs_dirent *, int); int hpfs_remove_dirent(struct inode *, dnode_secno, struct hpfs_dirent *, struct quad_buffer_head *, int); void hpfs_count_dnodes(struct super_block *, dnode_secno, int *, int *, int *); dnode_secno hpfs_de_as_down_as_possible(struct super_block *, dnode_secno dno); struct hpfs_dirent *map_pos_dirent(struct inode *, loff_t *, struct quad_buffer_head *); -struct hpfs_dirent *map_dirent(struct inode *, dnode_secno, char *, unsigned, dnode_secno *, struct quad_buffer_head *); +struct hpfs_dirent *map_dirent(struct inode *, dnode_secno, + const unsigned char *, unsigned, dnode_secno *, + struct quad_buffer_head *); void hpfs_remove_dtree(struct super_block *, dnode_secno); struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct fnode *, struct quad_buffer_head *); @@ -259,7 +263,8 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct f void hpfs_ea_ext_remove(struct super_block *, secno, int, unsigned); int hpfs_read_ea(struct super_block *, struct fnode *, char *, char *, int); char *hpfs_get_ea(struct super_block *, struct fnode *, char *, int *); -void hpfs_set_ea(struct inode *, struct fnode *, char *, char *, int); +void hpfs_set_ea(struct inode *, struct fnode *, const char *, + const char *, int); /* file.c */ @@ -282,7 +287,7 @@ void hpfs_delete_inode(struct inode *); unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *); unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *); -char *hpfs_load_code_page(struct super_block *, secno); +unsigned char *hpfs_load_code_page(struct super_block *, secno); secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp); struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **); struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **); @@ -292,12 +297,13 @@ dnode_secno hpfs_fnode_dno(struct super_block *s, ino_t ino); /* name.c */ unsigned char hpfs_upcase(unsigned char *, unsigned char); -int hpfs_chk_name(unsigned char *, unsigned *); -char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int); -int hpfs_compare_names(struct super_block *, unsigned char *, unsigned, unsigned char *, unsigned, int); -int hpfs_is_name_long(unsigned char *, unsigned); -void hpfs_adjust_length(unsigned char *, unsigned *); -void hpfs_decide_conv(struct inode *, unsigned char *, unsigned); +int hpfs_chk_name(const unsigned char *, unsigned *); +unsigned char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int); +int hpfs_compare_names(struct super_block *, const unsigned char *, unsigned, + const unsigned char *, unsigned, int); +int hpfs_is_name_long(const unsigned char *, unsigned); +void hpfs_adjust_length(const unsigned char *, unsigned *); +void hpfs_decide_conv(struct inode *, const unsigned char *, unsigned); /* namei.c */ diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index fe703ae46bc7..ff90affb94e1 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -46,7 +46,7 @@ void hpfs_read_inode(struct inode *i) struct fnode *fnode; struct super_block *sb = i->i_sb; struct hpfs_inode_info *hpfs_inode = hpfs_i(i); - unsigned char *ea; + void *ea; int ea_size; if (!(fnode = hpfs_map_fnode(sb, i->i_ino, &bh))) { @@ -112,7 +112,7 @@ void hpfs_read_inode(struct inode *i) } } if (fnode->dirflag) { - unsigned n_dnodes, n_subdirs; + int n_dnodes, n_subdirs; i->i_mode |= S_IFDIR; i->i_op = &hpfs_dir_iops; i->i_fop = &hpfs_dir_ops; diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index c4724589b2eb..840d033ecee8 100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -35,7 +35,7 @@ unsigned int *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block, * lowercasing table */ -char *hpfs_load_code_page(struct super_block *s, secno cps) +unsigned char *hpfs_load_code_page(struct super_block *s, secno cps) { struct buffer_head *bh; secno cpds; @@ -71,7 +71,7 @@ char *hpfs_load_code_page(struct super_block *s, secno cps) brelse(bh); return NULL; } - ptr = (char *)cpd + cpd->offs[cpi] + 6; + ptr = (unsigned char *)cpd + cpd->offs[cpi] + 6; if (!(cp_table = kmalloc(256, GFP_KERNEL))) { printk("HPFS: out of memory for code page table\n"); brelse(bh); @@ -217,7 +217,7 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno, if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD))) if (hpfs_sb(s)->sb_chk) { unsigned p, pp = 0; - unsigned char *d = (char *)dnode; + unsigned char *d = (unsigned char *)dnode; int b = 0; if (dnode->magic != DNODE_MAGIC) { hpfs_error(s, "bad magic on dnode %08x", secno); diff --git a/fs/hpfs/name.c b/fs/hpfs/name.c index 1f4a964384eb..f24736d7a439 100644 --- a/fs/hpfs/name.c +++ b/fs/hpfs/name.c @@ -8,16 +8,16 @@ #include "hpfs_fn.h" -static char *text_postfix[]={ +static const char *text_postfix[]={ ".ASM", ".BAS", ".BAT", ".C", ".CC", ".CFG", ".CMD", ".CON", ".CPP", ".DEF", ".DOC", ".DPR", ".ERX", ".H", ".HPP", ".HTM", ".HTML", ".JAVA", ".LOG", ".PAS", ".RC", ".TEX", ".TXT", ".Y", ""}; -static char *text_prefix[]={ +static const char *text_prefix[]={ "AUTOEXEC.", "CHANGES", "COPYING", "CONFIG.", "CREDITS", "FAQ", "FILE_ID.DIZ", "MAKEFILE", "READ.ME", "README", "TERMCAP", ""}; -void hpfs_decide_conv(struct inode *inode, unsigned char *name, unsigned len) +void hpfs_decide_conv(struct inode *inode, const unsigned char *name, unsigned len) { struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); int i; @@ -71,7 +71,7 @@ static inline unsigned char locase(unsigned char *dir, unsigned char a) return dir[a]; } -int hpfs_chk_name(unsigned char *name, unsigned *len) +int hpfs_chk_name(const unsigned char *name, unsigned *len) { int i; if (*len > 254) return -ENAMETOOLONG; @@ -83,10 +83,10 @@ int hpfs_chk_name(unsigned char *name, unsigned *len) return 0; } -char *hpfs_translate_name(struct super_block *s, unsigned char *from, +unsigned char *hpfs_translate_name(struct super_block *s, unsigned char *from, unsigned len, int lc, int lng) { - char *to; + unsigned char *to; int i; if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) { printk("HPFS: Long name flag mismatch - name "); @@ -103,8 +103,9 @@ char *hpfs_translate_name(struct super_block *s, unsigned char *from, return to; } -int hpfs_compare_names(struct super_block *s, unsigned char *n1, unsigned l1, - unsigned char *n2, unsigned l2, int last) +int hpfs_compare_names(struct super_block *s, + const unsigned char *n1, unsigned l1, + const unsigned char *n2, unsigned l2, int last) { unsigned l = l1 < l2 ? l1 : l2; unsigned i; @@ -120,7 +121,7 @@ int hpfs_compare_names(struct super_block *s, unsigned char *n1, unsigned l1, return 0; } -int hpfs_is_name_long(unsigned char *name, unsigned len) +int hpfs_is_name_long(const unsigned char *name, unsigned len) { int i,j; for (i = 0; i < len && name[i] != '.'; i++) @@ -134,7 +135,7 @@ int hpfs_is_name_long(unsigned char *name, unsigned len) /* OS/2 clears dots and spaces at the end of file name, so we have to */ -void hpfs_adjust_length(unsigned char *name, unsigned *len) +void hpfs_adjust_length(const unsigned char *name, unsigned *len) { if (!*len) return; if (*len == 1 && name[0] == '.') return; diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 82b9c4ba9ed0..15fd2c06f4a7 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -11,7 +11,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { - const char *name = dentry->d_name.name; + const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; struct quad_buffer_head qbh0; struct buffer_head *bh; @@ -24,7 +24,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) int r; struct hpfs_dirent dee; int err; - if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err; + if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; lock_kernel(); err = -ENOSPC; fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); @@ -62,7 +62,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) result->i_mode &= ~0222; mutex_lock(&hpfs_i(dir)->i_mutex); - r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0); + r = hpfs_add_dirent(dir, name, len, &dee, 0); if (r == 1) goto bail3; if (r == -1) { @@ -121,7 +121,7 @@ bail: static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { - const char *name = dentry->d_name.name; + const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; struct inode *result = NULL; struct buffer_head *bh; @@ -130,7 +130,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc int r; struct hpfs_dirent dee; int err; - if ((err = hpfs_chk_name((char *)name, &len))) + if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; lock_kernel(); err = -ENOSPC; @@ -155,7 +155,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc result->i_op = &hpfs_file_iops; result->i_fop = &hpfs_file_ops; result->i_nlink = 1; - hpfs_decide_conv(result, (char *)name, len); + hpfs_decide_conv(result, name, len); hpfs_i(result)->i_parent_dir = dir->i_ino; result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, dee.creation_date); result->i_ctime.tv_nsec = 0; @@ -170,7 +170,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc hpfs_i(result)->mmu_private = 0; mutex_lock(&hpfs_i(dir)->i_mutex); - r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0); + r = hpfs_add_dirent(dir, name, len, &dee, 0); if (r == 1) goto bail2; if (r == -1) { @@ -211,7 +211,7 @@ bail: static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) { - const char *name = dentry->d_name.name; + const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; struct buffer_head *bh; struct fnode *fnode; @@ -220,7 +220,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t struct hpfs_dirent dee; struct inode *result = NULL; int err; - if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err; + if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM; if (!new_valid_dev(rdev)) return -EINVAL; @@ -256,7 +256,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t init_special_inode(result, mode, rdev); mutex_lock(&hpfs_i(dir)->i_mutex); - r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0); + r = hpfs_add_dirent(dir, name, len, &dee, 0); if (r == 1) goto bail2; if (r == -1) { @@ -289,7 +289,7 @@ bail: static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink) { - const char *name = dentry->d_name.name; + const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; struct buffer_head *bh; struct fnode *fnode; @@ -298,7 +298,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy struct hpfs_dirent dee; struct inode *result; int err; - if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err; + if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; lock_kernel(); if (hpfs_sb(dir->i_sb)->sb_eas < 2) { unlock_kernel(); @@ -335,7 +335,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy result->i_data.a_ops = &hpfs_symlink_aops; mutex_lock(&hpfs_i(dir)->i_mutex); - r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0); + r = hpfs_add_dirent(dir, name, len, &dee, 0); if (r == 1) goto bail2; if (r == -1) { @@ -345,7 +345,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy fnode->len = len; memcpy(fnode->name, name, len > 15 ? 15 : len); fnode->up = dir->i_ino; - hpfs_set_ea(result, fnode, "SYMLINK", (char *)symlink, strlen(symlink)); + hpfs_set_ea(result, fnode, "SYMLINK", symlink, strlen(symlink)); mark_buffer_dirty(bh); brelse(bh); @@ -369,7 +369,7 @@ bail: static int hpfs_unlink(struct inode *dir, struct dentry *dentry) { - const char *name = dentry->d_name.name; + const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; struct quad_buffer_head qbh; struct hpfs_dirent *de; @@ -381,12 +381,12 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry) int err; lock_kernel(); - hpfs_adjust_length((char *)name, &len); + hpfs_adjust_length(name, &len); again: mutex_lock(&hpfs_i(inode)->i_parent_mutex); mutex_lock(&hpfs_i(dir)->i_mutex); err = -ENOENT; - de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *)name, len, &dno, &qbh); + de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh); if (!de) goto out; @@ -451,7 +451,7 @@ out: static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) { - const char *name = dentry->d_name.name; + const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; struct quad_buffer_head qbh; struct hpfs_dirent *de; @@ -462,12 +462,12 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) int err; int r; - hpfs_adjust_length((char *)name, &len); + hpfs_adjust_length(name, &len); lock_kernel(); mutex_lock(&hpfs_i(inode)->i_parent_mutex); mutex_lock(&hpfs_i(dir)->i_mutex); err = -ENOENT; - de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *)name, len, &dno, &qbh); + de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh); if (!de) goto out; @@ -546,10 +546,10 @@ const struct address_space_operations hpfs_symlink_aops = { static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - char *old_name = (char *)old_dentry->d_name.name; - int old_len = old_dentry->d_name.len; - char *new_name = (char *)new_dentry->d_name.name; - int new_len = new_dentry->d_name.len; + const unsigned char *old_name = old_dentry->d_name.name; + unsigned old_len = old_dentry->d_name.len; + const unsigned char *new_name = new_dentry->d_name.name; + unsigned new_len = new_dentry->d_name.len; struct inode *i = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct quad_buffer_head qbh, qbh1; @@ -560,9 +560,9 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *bh; struct fnode *fnode; int err; - if ((err = hpfs_chk_name((char *)new_name, &new_len))) return err; + if ((err = hpfs_chk_name(new_name, &new_len))) return err; err = 0; - hpfs_adjust_length((char *)old_name, &old_len); + hpfs_adjust_length(old_name, &old_len); lock_kernel(); /* order doesn't matter, due to VFS exclusion */ @@ -579,7 +579,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto end1; } - if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, (char *)old_name, old_len, &dno, &qbh))) { + if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) { hpfs_error(i->i_sb, "lookup succeeded but map dirent failed"); err = -ENOENT; goto end1; @@ -590,7 +590,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode) { int r; if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 1)) != 2) { - if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, (char *)new_name, new_len, NULL, &qbh1))) { + if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, new_name, new_len, NULL, &qbh1))) { clear_nlink(new_inode); copy_de(nde, &de); memcpy(nde->name, new_name, new_len); @@ -618,7 +618,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (new_dir == old_dir) - if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, (char *)old_name, old_len, &dno, &qbh))) { + if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) { hpfs_unlock_creation(i->i_sb); hpfs_error(i->i_sb, "lookup succeeded but map dirent failed at #2"); err = -ENOENT; @@ -648,7 +648,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, brelse(bh); } hpfs_i(i)->i_conv = hpfs_sb(i->i_sb)->sb_conv; - hpfs_decide_conv(i, (char *)new_name, new_len); + hpfs_decide_conv(i, new_name, new_len); end1: if (old_dir != new_dir) mutex_unlock(&hpfs_i(new_dir)->i_mutex); -- cgit v1.2.3 From 89031bc79782a93fc65adabd0e123c89645bee6e Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sun, 31 Jan 2010 20:49:54 -0500 Subject: sanitize const/signedness of ufs a bit Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/ufs/dir.c | 10 +++++----- fs/ufs/ufs.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 22af68f8b682..317a0d444f6b 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -31,7 +31,7 @@ * len <= UFS_MAXNAMLEN and de != NULL are guaranteed by caller. */ static inline int ufs_match(struct super_block *sb, int len, - const char * const name, struct ufs_dir_entry * de) + const unsigned char *name, struct ufs_dir_entry *de) { if (len != ufs_get_de_namlen(sb, de)) return 0; @@ -70,7 +70,7 @@ static inline unsigned long ufs_dir_pages(struct inode *inode) return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; } -ino_t ufs_inode_by_name(struct inode *dir, struct qstr *qstr) +ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) { ino_t res = 0; struct ufs_dir_entry *de; @@ -249,11 +249,11 @@ struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p) * (as a parameter - res_dir). Page is returned mapped and unlocked. * Entry is guaranteed to be valid. */ -struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct qstr *qstr, +struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, struct page **res_page) { struct super_block *sb = dir->i_sb; - const char *name = qstr->name; + const unsigned char *name = qstr->name; int namelen = qstr->len; unsigned reclen = UFS_DIR_REC_LEN(namelen); unsigned long start, n; @@ -313,7 +313,7 @@ found: int ufs_add_link(struct dentry *dentry, struct inode *inode) { struct inode *dir = dentry->d_parent->d_inode; - const char *name = dentry->d_name.name; + const unsigned char *name = dentry->d_name.name; int namelen = dentry->d_name.len; struct super_block *sb = dir->i_sb; unsigned reclen = UFS_DIR_REC_LEN(namelen); diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 0b4c39bc0d9e..01d0e2a3b230 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -86,9 +86,9 @@ extern void ufs_put_cylinder (struct super_block *, unsigned); /* dir.c */ extern const struct inode_operations ufs_dir_inode_operations; extern int ufs_add_link (struct dentry *, struct inode *); -extern ino_t ufs_inode_by_name(struct inode *, struct qstr *); +extern ino_t ufs_inode_by_name(struct inode *, const struct qstr *); extern int ufs_make_empty(struct inode *, struct inode *); -extern struct ufs_dir_entry *ufs_find_entry(struct inode *, struct qstr *, struct page **); +extern struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, struct page **); extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *); extern int ufs_empty_dir (struct inode *); extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); -- cgit v1.2.3 From 0319003d0d229735770c185ddf132c666e9cd01a Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sun, 31 Jan 2010 21:02:09 -0500 Subject: nilfs really shouldn't slap struct dentry on stack... ... especially when it only needs (and initializes) .d_name of it Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/nilfs2/dir.c | 10 +++++----- fs/nilfs2/namei.c | 13 +++++-------- fs/nilfs2/nilfs.h | 4 ++-- 3 files changed, 12 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 76d803e060a9..26402b9b305e 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -349,11 +349,11 @@ done: * Entry is guaranteed to be valid. */ struct nilfs_dir_entry * -nilfs_find_entry(struct inode *dir, struct dentry *dentry, +nilfs_find_entry(struct inode *dir, const struct qstr *qstr, struct page **res_page) { - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; + const unsigned char *name = qstr->name; + int namelen = qstr->len; unsigned reclen = NILFS_DIR_REC_LEN(namelen); unsigned long start, n; unsigned long npages = dir_pages(dir); @@ -424,13 +424,13 @@ struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p) return de; } -ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry) +ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) { ino_t res = 0; struct nilfs_dir_entry *de; struct page *page; - de = nilfs_find_entry(dir, dentry, &page); + de = nilfs_find_entry(dir, qstr, &page); if (de) { res = le64_to_cpu(de->inode); kunmap(page); diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 07ba838ef089..ad6ed2cf19b4 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -67,7 +67,7 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) if (dentry->d_name.len > NILFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - ino = nilfs_inode_by_name(dir, dentry); + ino = nilfs_inode_by_name(dir, &dentry->d_name); inode = NULL; if (ino) { inode = nilfs_iget(dir->i_sb, ino); @@ -81,10 +81,7 @@ struct dentry *nilfs_get_parent(struct dentry *child) { unsigned long ino; struct inode *inode; - struct dentry dotdot; - - dotdot.d_name.name = ".."; - dotdot.d_name.len = 2; + struct qstr dotdot = {.name = "..", .len = 2}; ino = nilfs_inode_by_name(child->d_inode, &dotdot); if (!ino) @@ -296,7 +293,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) int err; err = -ENOENT; - de = nilfs_find_entry(dir, dentry, &page); + de = nilfs_find_entry(dir, &dentry->d_name, &page); if (!de) goto out; @@ -389,7 +386,7 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, return err; err = -ENOENT; - old_de = nilfs_find_entry(old_dir, old_dentry, &old_page); + old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; @@ -409,7 +406,7 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_dir; err = -ENOENT; - new_de = nilfs_find_entry(new_dir, new_dentry, &new_page); + new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page); if (!new_de) goto out_dir; inc_nlink(old_inode); diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 4da6f67e9a91..8723e5bfd071 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -217,10 +217,10 @@ static inline int nilfs_init_acl(struct inode *inode, struct inode *dir) /* dir.c */ extern int nilfs_add_link(struct dentry *, struct inode *); -extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *); +extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *); extern int nilfs_make_empty(struct inode *, struct inode *); extern struct nilfs_dir_entry * -nilfs_find_entry(struct inode *, struct dentry *, struct page **); +nilfs_find_entry(struct inode *, const struct qstr *, struct page **); extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *); extern int nilfs_empty_dir(struct inode *); extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **); -- cgit v1.2.3 From 072f98b4637eddcbdf2178fc84f382e2ee522f08 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sun, 31 Jan 2010 21:03:58 -0500 Subject: nilfs: sanitize const/signedness in dealing with ->d_name.name Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/nilfs2/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 26402b9b305e..0092840492ee 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -224,7 +224,7 @@ fail: * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller. */ static int -nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de) +nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de) { if (len != de->name_len) return 0; @@ -465,7 +465,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, int nilfs_add_link(struct dentry *dentry, struct inode *inode) { struct inode *dir = dentry->d_parent->d_inode; - const char *name = dentry->d_name.name; + const unsigned char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned chunk_size = nilfs_chunk_size(dir); unsigned reclen = NILFS_DIR_REC_LEN(namelen); -- cgit v1.2.3 From 391e8bbd38474b9f85b1f3933394a79ea66fe1e2 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sun, 31 Jan 2010 21:28:48 -0500 Subject: sanitize const/signedness for udf Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/udf/balloc.c | 2 +- fs/udf/dir.c | 4 ++-- fs/udf/inode.c | 2 +- fs/udf/namei.c | 20 ++++++++++---------- fs/udf/symlink.c | 10 +++++----- 5 files changed, 19 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 82372e332f08..b2d96f45c12b 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -547,7 +547,7 @@ static void udf_table_free_blocks(struct super_block *sb, } if (epos.offset + (2 * adsize) > sb->s_blocksize) { - char *sptr, *dptr; + unsigned char *sptr, *dptr; int loffset; brelse(oepos.bh); diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 61d9a76a3a69..f0f2a436251e 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -45,8 +45,8 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, int block, iblock; loff_t nf_pos = (filp->f_pos - 1) << 2; int flen; - char *fname = NULL; - char *nameptr; + unsigned char *fname = NULL; + unsigned char *nameptr; uint16_t liu; uint8_t lfi; loff_t size = udf_ext0_offset(dir) + dir->i_size; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index f90231eb2916..378a7592257c 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1672,7 +1672,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, return -1; if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { - char *sptr, *dptr; + unsigned char *sptr, *dptr; struct buffer_head *nbh; int err, loffset; struct kernel_lb_addr obloc = epos->block; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index cd2115060fdc..7c56ff00cd53 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -34,8 +34,8 @@ #include <linux/crc-itu-t.h> #include <linux/exportfs.h> -static inline int udf_match(int len1, const char *name1, int len2, - const char *name2) +static inline int udf_match(int len1, const unsigned char *name1, int len2, + const unsigned char *name2) { if (len1 != len2) return 0; @@ -142,15 +142,15 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, } static struct fileIdentDesc *udf_find_entry(struct inode *dir, - struct qstr *child, + const struct qstr *child, struct udf_fileident_bh *fibh, struct fileIdentDesc *cfi) { struct fileIdentDesc *fi = NULL; loff_t f_pos; int block, flen; - char *fname = NULL; - char *nameptr; + unsigned char *fname = NULL; + unsigned char *nameptr; uint8_t lfi; uint16_t liu; loff_t size; @@ -308,7 +308,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, { struct super_block *sb = dir->i_sb; struct fileIdentDesc *fi = NULL; - char *name = NULL; + unsigned char *name = NULL; int namelen; loff_t f_pos; loff_t size = udf_ext0_offset(dir) + dir->i_size; @@ -885,16 +885,16 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, { struct inode *inode; struct pathComponent *pc; - char *compstart; + const char *compstart; struct udf_fileident_bh fibh; struct extent_position epos = {}; int eoffset, elen = 0; struct fileIdentDesc *fi; struct fileIdentDesc cfi; - char *ea; + uint8_t *ea; int err; int block; - char *name = NULL; + unsigned char *name = NULL; int namelen; struct buffer_head *bh; struct udf_inode_info *iinfo; @@ -970,7 +970,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, pc = (struct pathComponent *)(ea + elen); - compstart = (char *)symname; + compstart = symname; do { symname++; diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index c3265e1385d4..852e91845688 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -32,12 +32,12 @@ #include <linux/buffer_head.h> #include "udf_i.h" -static void udf_pc_to_char(struct super_block *sb, char *from, int fromlen, - char *to) +static void udf_pc_to_char(struct super_block *sb, unsigned char *from, + int fromlen, unsigned char *to) { struct pathComponent *pc; int elen = 0; - char *p = to; + unsigned char *p = to; while (elen < fromlen) { pc = (struct pathComponent *)(from + elen); @@ -75,9 +75,9 @@ static int udf_symlink_filler(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; struct buffer_head *bh = NULL; - char *symlink; + unsigned char *symlink; int err = -EIO; - char *p = kmap(page); + unsigned char *p = kmap(page); struct udf_inode_info *iinfo; lock_kernel(); -- cgit v1.2.3 From e21e7095a78867364d7aa9223d833ccb966f93f3 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Mon, 1 Feb 2010 11:05:16 -0500 Subject: Don't mess with generic_permission() under ->d_lock in hpfs Just use dentry_unhash() there Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/hpfs/namei.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 15fd2c06f4a7..11c2b4080f65 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -413,22 +413,25 @@ again: mutex_unlock(&hpfs_i(dir)->i_mutex); mutex_unlock(&hpfs_i(inode)->i_parent_mutex); - d_drop(dentry); - spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count) > 1 || - generic_permission(inode, MAY_WRITE, NULL) || + dentry_unhash(dentry); + if (!d_unhashed(dentry)) { + dput(dentry); + unlock_kernel(); + return -ENOSPC; + } + if (generic_permission(inode, MAY_WRITE, NULL) || !S_ISREG(inode->i_mode) || get_write_access(inode)) { - spin_unlock(&dentry->d_lock); d_rehash(dentry); + dput(dentry); } else { struct iattr newattrs; - spin_unlock(&dentry->d_lock); /*printk("HPFS: truncating file before delete.\n");*/ newattrs.ia_size = 0; newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; err = notify_change(dentry, &newattrs); put_write_access(inode); + dput(dentry); if (!err) goto again; } -- cgit v1.2.3 From 9f5596af44514f99e3a654a4f7cb813354b9e516 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Fri, 5 Feb 2010 00:40:25 -0500 Subject: take check for new events in namespace (guts of mounts_poll()) to namespace.c Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namespace.c | 15 +++++++++++++++ fs/proc/base.c | 10 ++-------- include/linux/mnt_namespace.h | 1 + 3 files changed, 18 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index d5906c19e08e..970fe79d7867 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -737,6 +737,21 @@ static void m_stop(struct seq_file *m, void *v) up_read(&namespace_sem); } +int mnt_had_events(struct proc_mounts *p) +{ + struct mnt_namespace *ns = p->ns; + int res = 0; + + spin_lock(&vfsmount_lock); + if (p->event != ns->event) { + p->event = ns->event; + res = 1; + } + spin_unlock(&vfsmount_lock); + + return res; +} + struct proc_fs_info { int flag; const char *str; diff --git a/fs/proc/base.c b/fs/proc/base.c index 58324c299165..746895ddfda1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -647,17 +647,11 @@ static int mounts_release(struct inode *inode, struct file *file) static unsigned mounts_poll(struct file *file, poll_table *wait) { struct proc_mounts *p = file->private_data; - struct mnt_namespace *ns = p->ns; unsigned res = POLLIN | POLLRDNORM; - poll_wait(file, &ns->poll, wait); - - spin_lock(&vfsmount_lock); - if (p->event != ns->event) { - p->event = ns->event; + poll_wait(file, &p->ns->poll, wait); + if (mnt_had_events(p)) res |= POLLERR | POLLPRI; - } - spin_unlock(&vfsmount_lock); return res; } diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index d74785c2393a..0b89efc6f215 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -35,6 +35,7 @@ static inline void get_mnt_ns(struct mnt_namespace *ns) extern const struct seq_operations mounts_op; extern const struct seq_operations mountinfo_op; extern const struct seq_operations mountstats_op; +extern int mnt_had_events(struct proc_mounts *); #endif #endif -- cgit v1.2.3 From 47cd813f2984569570021ce3d34cdf9cb20aa6a2 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Fri, 5 Feb 2010 02:01:14 -0500 Subject: Take vfsmount_lock to fs/internal.h no more users left outside of fs/*.c (and very few outside of fs/namespace.c, actually) Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/internal.h | 2 ++ include/linux/mount.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/internal.h b/fs/internal.h index e96a1667d749..8a03a5447bdf 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -70,6 +70,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); extern void __init mnt_init(void); +extern spinlock_t vfsmount_lock; + /* * fs_struct.c */ diff --git a/include/linux/mount.h b/include/linux/mount.h index 375d43a5d802..163896137ab5 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -132,7 +132,6 @@ extern int do_add_mount(struct vfsmount *newmnt, struct path *path, extern void mark_mounts_for_expiry(struct list_head *mounts); -extern spinlock_t vfsmount_lock; extern dev_t name_to_dev_t(char *name); #endif /* _LINUX_MOUNT_H */ -- cgit v1.2.3 From d498b25a4f877be535246c4e5b6ee5890781a477 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Fri, 5 Feb 2010 02:21:06 -0500 Subject: get rid of useless vfsmount_lock use in put_mnt_ns() It hadn't been needed since we'd sanitized the logics in mark_mounts_for_expiry() (which, in turn, used to be a rudiment of bad old times when namespace_sem was per-ns). Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namespace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 970fe79d7867..b0b15cc2117c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2344,17 +2344,13 @@ void __init mnt_init(void) void put_mnt_ns(struct mnt_namespace *ns) { - struct vfsmount *root; LIST_HEAD(umount_list); - if (!atomic_dec_and_lock(&ns->count, &vfsmount_lock)) + if (!atomic_dec_and_test(&ns->count)) return; - root = ns->root; - ns->root = NULL; - spin_unlock(&vfsmount_lock); down_write(&namespace_sem); spin_lock(&vfsmount_lock); - umount_tree(root, 0, &umount_list); + umount_tree(ns->root, 0, &umount_list); spin_unlock(&vfsmount_lock); up_write(&namespace_sem); release_mounts(&umount_list); -- cgit v1.2.3 From 8089352a13b785d4e0df63d87bd2b71c76bb9aee Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Fri, 5 Feb 2010 09:30:46 -0500 Subject: Mirror MS_KERNMOUNT in ->mnt_flags Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namespace.c | 2 +- fs/super.c | 3 +++ include/linux/mount.h | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index b0b15cc2117c..ffa3843404e0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1701,7 +1701,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, { int err; - mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD); + mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL); down_write(&namespace_sem); /* Something was mounted here while we slept */ diff --git a/fs/super.c b/fs/super.c index 903896ec7c73..f35ac6022109 100644 --- a/fs/super.c +++ b/fs/super.c @@ -937,6 +937,9 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void if (!mnt) goto out; + if (flags & MS_KERNMOUNT) + mnt->mnt_flags = MNT_INTERNAL; + if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { secdata = alloc_secdata(); if (!secdata) diff --git a/include/linux/mount.h b/include/linux/mount.h index 163896137ab5..ca726ebf50a3 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -45,6 +45,8 @@ struct mnt_namespace; #define MNT_PROPAGATION_MASK (MNT_SHARED | MNT_UNBINDABLE) +#define MNT_INTERNAL 0x4000 + struct vfsmount { struct list_head mnt_hash; struct vfsmount *mnt_parent; /* fs we are mounted on */ -- cgit v1.2.3 From 0ceeca5a08abb1d880f0cc0ea812ad14932070e0 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Fri, 5 Feb 2010 09:34:36 -0500 Subject: hppfs can use existing proc_mnt, no need for do_kern_mount() in there Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/hppfs/hppfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 7239efc690d8..2e4dfa8593da 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -718,7 +718,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent) struct vfsmount *proc_mnt; int err = -ENOENT; - proc_mnt = do_kern_mount("proc", 0, "proc", NULL); + proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt); if (IS_ERR(proc_mnt)) goto out; -- cgit v1.2.3 From db1f05bb85d7966b9176e293f3ceead1cb8b5d79 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi <mszeredi@suse.cz> Date: Wed, 10 Feb 2010 12:15:53 +0100 Subject: vfs: add NOFOLLOW flag to umount(2) Add a new UMOUNT_NOFOLLOW flag to umount(2). This is needed to prevent symlink attacks in unprivileged unmounts (fuse, samba, ncpfs). Additionally, return -EINVAL if an unknown flag is used (and specify an explicitly unused flag: UMOUNT_UNUSED). This makes it possible for the caller to determine if a flag is supported or not. CC: Eugene Teo <eugene@redhat.com> CC: Michael Kerrisk <mtk.manpages@gmail.com> Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namespace.c | 9 ++++++++- include/linux/fs.h | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index ffa3843404e0..8174c8ab5c70 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1136,8 +1136,15 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) { struct path path; int retval; + int lookup_flags = 0; - retval = user_path(name, &path); + if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) + return -EINVAL; + + if (!(flags & UMOUNT_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + + retval = user_path_at(AT_FDCWD, name, lookup_flags, &path); if (retval) goto out; retval = -EINVAL; diff --git a/include/linux/fs.h b/include/linux/fs.h index e764f247d0ab..5b3182c7eb5f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1305,6 +1305,8 @@ extern int send_sigurg(struct fown_struct *fown); #define MNT_FORCE 0x00000001 /* Attempt to forcibily umount */ #define MNT_DETACH 0x00000002 /* Just detach from the tree */ #define MNT_EXPIRE 0x00000004 /* Mark for expiry */ +#define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ +#define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ extern struct list_head super_blocks; extern spinlock_t sb_lock; -- cgit v1.2.3 From bec1052e5be6a70f03f6adc650f3a6e4c2f44ddf Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Wed, 3 Mar 2010 14:12:08 -0500 Subject: set S_DEAD on unlink() and non-directory rename() victims Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 3df2ed50ab57..54d33df06be0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2262,8 +2262,11 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); - if (!error) + if (!error) { error = dir->i_op->unlink(dir, dentry); + if (!error) + dentry->d_inode->i_flags |= S_DEAD; + } } mutex_unlock(&dentry->d_inode->i_mutex); @@ -2616,6 +2619,8 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, else error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); if (!error) { + if (target) + target->i_flags |= S_DEAD; if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) d_move(old_dentry, new_dentry); } -- cgit v1.2.3 From 4919c5e45a91b5db5a41695fe0357fbdff0d5767 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Wed, 3 Mar 2010 14:13:08 -0500 Subject: fix race in d_splice_alias() rehashing the negative placeholder opens a race with d_lookup(); we unhash it almost immediately (by d_move()), but the race window is there. Since d_move() doesn't rely on target being hashed, we don't need that d_rehash() at all. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/dcache.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 74da947b160b..f1358e5c3a59 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1222,7 +1222,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); spin_unlock(&dcache_lock); security_d_instantiate(new, inode); - d_rehash(dentry); d_move(new, dentry); iput(inode); } else { -- cgit v1.2.3 From 8d75da8afd068fa58b35e69c7c8c46770d9e7a98 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" <bfields@citi.umich.edu> Date: Wed, 3 Mar 2010 16:13:29 -0500 Subject: nfsd4: fix minor memory leak There's no need to allocate this cred more than once. Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu> --- fs/nfsd/nfs4callback.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 8fa412ce8021..4bc22c763de7 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -525,6 +525,8 @@ static struct rpc_cred *callback_cred; int set_callback_cred(void) { + if (callback_cred) + return 0; callback_cred = rpc_lookup_machine_cred(); if (!callback_cred) return -ENOMEM; -- cgit v1.2.3 From 9b1d0998d24f9c207d5fbdd0b8bac07284e0eda7 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Wed, 3 Mar 2010 16:19:32 -0500 Subject: ext4: Release page references acquired in ext4_da_block_invalidatepages We forget to release page references we acquire in ext4_da_block_invalidatepages. Luckily, this function gets called only if we are not able to allocate blocks for delay-allocated data so that function should better never be called. Also cleanup handling of index variable. Reported-by: Wu Fengguang <fengguang.wu@intel.com> Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/inode.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c717a74f2178..f55df7192b95 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2157,17 +2157,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - index = page->index; - if (index > end) + if (page->index > end) break; - index++; - BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); block_invalidatepage(page, 0); ClearPageUptodate(page); unlock_page(page); } + index = pvec.pages[nr_pages - 1]->index + 1; + pagevec_release(&pvec); } return; } -- cgit v1.2.3 From 5661bd6861b7490394e29aaf74dca812188272e4 Mon Sep 17 00:00:00 2001 From: Akinobu Mita <akinobu.mita@gmail.com> Date: Wed, 3 Mar 2010 23:53:39 -0500 Subject: ext4: cleanup to use ext4_group_first_block_no() This is a cleanup and simplification patch which takes some open-coded calculations to calculate the first block number of a group and converts them to use the (already defined) ext4_group_first_block_no() function. Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger@sun.com> --- fs/ext4/balloc.c | 3 +-- fs/ext4/extents.c | 3 +-- fs/ext4/mballoc.c | 29 +++++++++++------------------ fs/ext4/mballoc.h | 7 +------ 4 files changed, 14 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 720061a0ee53..fadbc4d3a202 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -130,8 +130,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * to make sure we calculate the right free blocks */ group_blocks = ext4_blocks_count(sbi->s_es) - - le32_to_cpu(sbi->s_es->s_first_data_block) - - (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1)); + ext4_group_first_block_no(sb, ngroups - 1); } else { group_blocks = EXT4_BLOCKS_PER_GROUP(sb); } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4bb69206f175..3c0bae11a097 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -195,8 +195,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, if (S_ISREG(inode->i_mode)) block_group++; } - bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; /* diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 72d5ceb18523..11568697b192 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -441,10 +441,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, for (i = 0; i < count; i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { ext4_fsblk_t blocknr; - blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += first + i; - blocknr += - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_grp_locked_error(sb, e4b->bd_group, __func__, "double-free of inode" " %lu's block %llu(bit %u in group %u)", @@ -1255,10 +1254,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { ext4_fsblk_t blocknr; - blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += block; - blocknr += - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_grp_locked_error(sb, e4b->bd_group, __func__, "double-free of inode" " %lu's block %llu(bit %u in group %u)", @@ -1631,7 +1629,6 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, int max; int err; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_super_block *es = sbi->s_es; struct ext4_free_extent ex; if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) @@ -1648,8 +1645,8 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ext4_fsblk_t start; - start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + - ex.fe_start + le32_to_cpu(es->s_first_data_block); + start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + + ex.fe_start; /* use do_div to get remainder (would be 64-bit modulo) */ if (do_div(start, sbi->s_stripe) == 0) { ac->ac_found++; @@ -1803,8 +1800,8 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, BUG_ON(sbi->s_stripe == 0); /* find first stripe-aligned block in group */ - first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb) - + le32_to_cpu(sbi->s_es->s_first_data_block); + first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); + a = first_group_block + sbi->s_stripe - 1; do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; @@ -2560,12 +2557,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) ext4_unlock_group(sb, entry->group); if (test_opt(sb, DISCARD)) { ext4_fsblk_t discard_block; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - discard_block = (ext4_fsblk_t)entry->group * - EXT4_BLOCKS_PER_GROUP(sb) - + entry->start_blk - + le32_to_cpu(es->s_first_data_block); + discard_block = entry->start_blk + + ext4_group_first_block_no(sb, entry->group); trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, entry->count); @@ -3525,8 +3519,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); - start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + - le32_to_cpu(sbi->s_es->s_first_data_block); + start = ext4_group_first_block_no(sb, group) + bit; mb_debug(1, " free preallocated %u/%u in group %u\n", (unsigned) start, (unsigned) next - bit, (unsigned) group); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 436521cae456..9b2deed652b7 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -225,11 +225,6 @@ struct ext4_buddy { static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { - ext4_fsblk_t block; - - block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb) - + fex->fe_start - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - return block; + return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start; } #endif -- cgit v1.2.3 From bda00de7e8569b1fcde27b68fa59e74e14c5f93a Mon Sep 17 00:00:00 2001 From: Akinobu Mita <akinobu.mita@gmail.com> Date: Wed, 3 Mar 2010 23:53:25 -0500 Subject: ext4: cleanup to use ext4_grp_offs_to_block() More cleanup to convert open-coded calculations of the first block number of a free extent to use ext4_grp_offs_to_block() instead. Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger@sun.com> --- fs/ext4/mballoc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 11568697b192..37d2438e0cb4 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2697,9 +2697,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (err) goto out_err; - block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb) - + ac->ac_b_ex.fe_start - + le32_to_cpu(es->s_first_data_block); + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); len = ac->ac_b_ex.fe_len; if (!ext4_data_block_valid(sbi, block, len)) { @@ -3154,9 +3152,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; - goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + - ac->ac_g_ex.fe_start + - le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block); + goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); /* * search for the prealloc space that is having * minimal distance from the goal block. -- cgit v1.2.3 From 731eb1a03a8445cde2cb23ecfb3580c6fa7bb690 Mon Sep 17 00:00:00 2001 From: Akinobu Mita <akinobu.mita@gmail.com> Date: Wed, 3 Mar 2010 23:55:01 -0500 Subject: ext4: consolidate in_range() definitions There are duplicate macro definitions of in_range() in mballoc.h and balloc.c. This consolidates these two definitions into ext4.h, and changes extents.c to use in_range() as well. Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger@sun.com> --- fs/ext4/balloc.c | 3 --- fs/ext4/ext4.h | 2 ++ fs/ext4/extents.c | 4 ++-- fs/ext4/mballoc.h | 2 -- 4 files changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index fadbc4d3a202..d2f37a5516c7 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -188,9 +188,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * when a file system is mounted (see ext4_fill_super). */ - -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - /** * ext4_get_group_desc() -- load group descriptor from disk * @sb: super block diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3d85bbb09f1b..9b179163f1de 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1819,6 +1819,8 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + #endif /* __KERNEL__ */ #endif /* _EXT4_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3c0bae11a097..94c8ee81f5e1 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2051,7 +2051,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && cex->ec_type != EXT4_EXT_CACHE_EXTENT); - if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { + if (in_range(block, cex->ec_block, cex->ec_len)) { ex->ee_block = cpu_to_le32(cex->ec_block); ext4_ext_store_pblock(ex, cex->ec_start); ex->ee_len = cpu_to_le16(cex->ec_len); @@ -3364,7 +3364,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, */ ee_len = ext4_ext_get_actual_len(ex); /* if found extent covers block, simply return it */ - if (iblock >= ee_block && iblock < ee_block + ee_len) { + if (in_range(iblock, ee_block, ee_len)) { newblock = iblock - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (iblock - ee_block); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 9b2deed652b7..b619322c76f0 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -220,8 +220,6 @@ struct ext4_buddy { #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { -- cgit v1.2.3 From 5fd5249aa36fad98c9fd5edced352939e54f9324 Mon Sep 17 00:00:00 2001 From: Akira Fujita <a-fujita@rs.jp.nec.com> Date: Thu, 4 Mar 2010 00:31:06 -0500 Subject: ext4: Fix insertion point of extent in mext_insert_across_blocks() If the leaf node has 2 extent space or fewer and EXT4_IOC_MOVE_EXT ioctl is called with the file offset where after the 2nd extent covers, mext_insert_across_blocks() always tries to insert extent into the first extent. As a result, the file gets corrupted because of wrong extent order. The patch fixes this problem. Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/move_extent.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 1654eb862d74..9eca1c0ec546 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -252,6 +252,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, } o_start->ee_len = start_ext->ee_len; + eblock = le32_to_cpu(start_ext->ee_block); new_flag = 1; } else if (start_ext->ee_len && new_ext->ee_len && @@ -262,6 +263,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, * orig |------------------------------| */ o_start->ee_len = start_ext->ee_len; + eblock = le32_to_cpu(start_ext->ee_block); new_flag = 1; } else if (!start_ext->ee_len && new_ext->ee_len && @@ -502,6 +504,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, le32_to_cpu(oext->ee_block) + oext_alen) { start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - le32_to_cpu(oext->ee_block)); + start_ext.ee_block = oext->ee_block; copy_extent_status(oext, &start_ext); } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { prev_ext = oext - 1; @@ -515,6 +518,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, start_ext.ee_len = cpu_to_le16( ext4_ext_get_actual_len(prev_ext) + new_ext_alen); + start_ext.ee_block = oext->ee_block; copy_extent_status(prev_ext, &start_ext); new_ext.ee_len = 0; } -- cgit v1.2.3 From 7247c0caa23d94a1cb6b307edba9dc45fb0798d4 Mon Sep 17 00:00:00 2001 From: Akira Fujita <a-fujita@rs.jp.nec.com> Date: Thu, 4 Mar 2010 00:34:58 -0500 Subject: ext4: Fix the NULL reference in double_down_write_data_sem() If EXT4_IOC_MOVE_EXT ioctl is called with NULL donor_fd, fget() in ext4_ioctl() gets inappropriate file structure for donor; so we need to do this check earlier, before calling double_down_write_data_sem(). Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/move_extent.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 9eca1c0ec546..7e99f4e72bf5 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -953,14 +953,6 @@ mext_check_arguments(struct inode *orig_inode, unsigned int blkbits = orig_inode->i_blkbits; unsigned int blocksize = 1 << blkbits; - /* Regular file check */ - if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { - ext4_debug("ext4 move extent: The argument files should be " - "regular file [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { ext4_debug("ext4 move extent: suid or sgid is set" " to donor file [ino:orig %lu, donor %lu]\n", @@ -1208,6 +1200,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, return -EINVAL; } + /* Regular file check */ + if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { + ext4_debug("ext4 move extent: The argument files should be " + "regular file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + /* Protect orig and donor inodes against a truncate */ ret1 = mext_inode_double_lock(orig_inode, donor_inode); if (ret1 < 0) -- cgit v1.2.3 From c437b2733520599a2c6e0dbcdeae611319f84707 Mon Sep 17 00:00:00 2001 From: Akira Fujita <a-fujita@rs.jp.nec.com> Date: Thu, 4 Mar 2010 00:39:24 -0500 Subject: ext4: Code cleanup for EXT4_IOC_MOVE_EXT ioctl a) Fix sparse warning in ext4_ioctl() b) Remove unneeded variable in mext_leaf_block() c) Fix spelling typo in mext_check_arguments() Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> --- fs/ext4/ioctl.c | 3 ++- fs/ext4/move_extent.c | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 2220feb2dcc1..016d0249294f 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -258,7 +258,8 @@ setversion_out: if (me.moved_len > 0) file_remove_suid(donor_filp); - if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) + if (copy_to_user((struct move_extent __user *)arg, + &me, sizeof(me))) err = -EFAULT; mext_out: fput(donor_filp); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 7e99f4e72bf5..aa5fe28d180f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -477,7 +477,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, struct ext4_extent *oext, *o_start, *o_end, *prev_ext; struct ext4_extent new_ext, start_ext, end_ext; ext4_lblk_t new_ext_end; - ext4_fsblk_t new_phys_end; int oext_alen, new_ext_alen, end_ext_alen; int depth = ext_depth(orig_inode); int ret; @@ -491,7 +490,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, new_ext.ee_len = dext->ee_len; new_ext_alen = ext4_ext_get_actual_len(&new_ext); new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; - new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1; /* * Case: original extent is first @@ -932,7 +930,7 @@ out2: } /** - * mext_check_argumants - Check whether move extent can be done + * mext_check_arguments - Check whether move extent can be done * * @orig_inode: original inode * @donor_inode: donor inode -- cgit v1.2.3 From 9421502b4fc894cc477be8fc49776830e37ca157 Mon Sep 17 00:00:00 2001 From: Joern Engel <joern@logfs.org> Date: Thu, 4 Mar 2010 21:30:58 +0100 Subject: [LogFS] Fix bdev erases Erases for block devices were always just emulated by writing 0xff. Some time back the write was removed and only the page cache was changed to 0xff. Superficialy a good idea with two problems: 1. Touching the page cache isn't necessary either. 2. However, writing out 0xff _is_ necessary for the journal. As the journal is scanned linearly, an old non-overwritten commit entry can be used on next mount and cause havoc. This should fix both aspects. --- fs/logfs/dev_bdev.c | 88 +++++++++++++++++++++++++++++++++++++++++++++-------- fs/logfs/dev_mtd.c | 3 +- fs/logfs/journal.c | 2 +- fs/logfs/logfs.h | 6 ++-- fs/logfs/segment.c | 6 ++-- fs/logfs/super.c | 12 +++++++- 6 files changed, 97 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 58a057b6e1af..9718c22f186d 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -167,27 +167,91 @@ static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len) generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev)); } -static int bdev_erase(struct super_block *sb, loff_t to, size_t len) + +static void erase_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct super_block *sb = bio->bi_private; + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ + BUG_ON(err); + BUG_ON(bio->bi_vcnt == 0); + bio_put(bio); + if (atomic_dec_and_test(&super->s_pending_writes)) + wake_up(&wq); +} + +static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct bio *bio; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); + int i; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); /* FIXME: handle this */ + + for (i = 0; i < nr_pages; i++) { + if (i >= max_pages) { + /* Block layer cannot split bios :( */ + bio->bi_vcnt = i; + bio->bi_idx = 0; + bio->bi_size = i * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = erase_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + + ofs += i * PAGE_SIZE; + index += i; + nr_pages -= i; + i = 0; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + } + bio->bi_io_vec[i].bv_page = super->s_erase_page; + bio->bi_io_vec[i].bv_len = PAGE_SIZE; + bio->bi_io_vec[i].bv_offset = 0; + } + bio->bi_vcnt = nr_pages; + bio->bi_idx = 0; + bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = erase_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + return 0; +} + +static int bdev_erase(struct super_block *sb, loff_t to, size_t len, + int ensure_write) { struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - struct page *page; - pgoff_t index = to >> PAGE_SHIFT; - int i, nr_pages = len >> PAGE_SHIFT; BUG_ON(to & (PAGE_SIZE - 1)); BUG_ON(len & (PAGE_SIZE - 1)); - if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO) + if (super->s_flags & LOGFS_SB_FLAG_RO) return -EROFS; - for (i = 0; i < nr_pages; i++) { - page = find_get_page(mapping, index + i); - if (page) { - memset(page_address(page), 0xFF, PAGE_SIZE); - page_cache_release(page); - } + if (ensure_write) { + /* + * Object store doesn't care whether erases happen or not. + * But for the journal they are required. Otherwise a scan + * can find an old commit entry and assume it is the current + * one, travelling back in time. + */ + do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT); } + return 0; } diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index 68e99d046c23..cafb6ef2e05b 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -83,7 +83,8 @@ static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len) return 0; } -static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len) +static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len, + int ensure_write) { struct mtd_info *mtd = logfs_super(sb)->s_mtd; struct erase_info ei; diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c index 2f2e8e4fd02d..c0e7d63221d4 100644 --- a/fs/logfs/journal.c +++ b/fs/logfs/journal.c @@ -392,7 +392,7 @@ static int journal_erase_segment(struct logfs_area *area) u64 ofs; int err; - err = logfs_erase_segment(sb, area->a_segno); + err = logfs_erase_segment(sb, area->a_segno, 1); if (err) return err; diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index e3082abe9e3b..72592114a28f 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -151,7 +151,8 @@ struct logfs_device_ops { int (*write_sb)(struct super_block *sb, struct page *page); int (*readpage)(void *_sb, struct page *page); void (*writeseg)(struct super_block *sb, u64 ofs, size_t len); - int (*erase)(struct super_block *sb, loff_t ofs, size_t len); + int (*erase)(struct super_block *sb, loff_t ofs, size_t len, + int ensure_write); void (*sync)(struct super_block *sb); void (*put_device)(struct super_block *sb); }; @@ -327,6 +328,7 @@ struct logfs_super { u64 s_feature_compat; u64 s_feature_flags; u64 s_sb_ofs[2]; + struct page *s_erase_page; /* for dev_bdev.c */ /* alias.c fields */ struct btree_head32 s_segment_alias; /* remapped segments */ int s_no_object_aliases; @@ -572,7 +574,7 @@ int get_page_reserve(struct inode *inode, struct page *page); extern struct logfs_block_ops indirect_block_ops; /* segment.c */ -int logfs_erase_segment(struct super_block *sb, u32 ofs); +int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase); int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf); int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix, level_t level); diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 5f58b74516ca..664cd0dd3576 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -25,14 +25,14 @@ static int logfs_mark_segment_bad(struct super_block *sb, u32 segno) return 0; } -int logfs_erase_segment(struct super_block *sb, u32 segno) +int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase) { struct logfs_super *super = logfs_super(sb); super->s_gec++; return super->s_devops->erase(sb, (u64)segno << super->s_segshift, - super->s_segsize); + super->s_segsize, ensure_erase); } static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes) @@ -798,7 +798,7 @@ static int ostore_erase_segment(struct logfs_area *area) u64 ofs; int err; - err = logfs_erase_segment(sb, area->a_segno); + err = logfs_erase_segment(sb, area->a_segno, 0); if (err) return err; diff --git a/fs/logfs/super.c b/fs/logfs/super.c index d128a2c1c8d1..94d80f7ee7c2 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -317,6 +317,7 @@ static int logfs_make_writeable(struct super_block *sb) static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt) { + struct logfs_super *super = logfs_super(sb); struct inode *rootdir; int err; @@ -329,15 +330,22 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt) if (!sb->s_root) goto fail; + super->s_erase_page = alloc_pages(GFP_KERNEL, 0); + if (!super->s_erase_page) + goto fail2; + memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE); + /* FIXME: check for read-only mounts */ err = logfs_make_writeable(sb); if (err) - goto fail2; + goto fail3; log_super("LogFS: Finished mounting\n"); simple_set_mnt(mnt, sb); return 0; +fail3: + __free_page(super->s_erase_page); fail2: iput(rootdir); fail: @@ -498,6 +506,8 @@ static void logfs_kill_sb(struct super_block *sb) logfs_cleanup_journal(sb); logfs_cleanup_areas(sb); logfs_cleanup_rw(sb); + if (super->s_erase_page) + __free_page(super->s_erase_page); super->s_devops->put_device(sb); mempool_destroy(super->s_btree_pool); mempool_destroy(super->s_alias_pool); -- cgit v1.2.3 From c6d3830140f1d56b07d8ab56a6e14ca3c492a39a Mon Sep 17 00:00:00 2001 From: Joern Engel <joern@logfs.org> Date: Thu, 4 Mar 2010 21:36:19 +0100 Subject: [LogFS] Only write journal if dirty This prevents unnecessary journal writes. More importantly it prevents an oops due to a journal write on failed mount. --- fs/logfs/gc.c | 6 +++--- fs/logfs/journal.c | 11 +++++++---- fs/logfs/logfs.h | 4 ++-- fs/logfs/readwrite.c | 2 +- fs/logfs/segment.c | 7 +++++-- fs/logfs/super.c | 2 +- 6 files changed, 19 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c index b3656c44190e..92949f95a901 100644 --- a/fs/logfs/gc.c +++ b/fs/logfs/gc.c @@ -469,7 +469,7 @@ static void __logfs_gc_pass(struct super_block *sb, int target) /* Sync in-memory state with on-medium state in case they * diverged */ - logfs_write_anchor(super->s_master_inode); + logfs_write_anchor(sb); round += logfs_scan_some(sb); if (no_free_segments(sb) >= target) goto write_alias; @@ -613,8 +613,8 @@ void logfs_gc_pass(struct super_block *sb) */ if (super->s_dirty_used_bytes + super->s_dirty_free_bytes + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes) - logfs_write_anchor(super->s_master_inode); - __logfs_gc_pass(sb, logfs_super(sb)->s_total_levels); + logfs_write_anchor(sb); + __logfs_gc_pass(sb, super->s_total_levels); logfs_wl_pass(sb); logfs_journal_wl_pass(sb); } diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c index c0e7d63221d4..57eb4fb444a9 100644 --- a/fs/logfs/journal.c +++ b/fs/logfs/journal.c @@ -724,14 +724,17 @@ static int logfs_write_obj_aliases(struct super_block *sb) * bit wasteful, but robustness is more important. With this we can *always* * erase all journal segments except the one containing the most recent commit. */ -void logfs_write_anchor(struct inode *inode) +void logfs_write_anchor(struct super_block *sb) { - struct super_block *sb = inode->i_sb; struct logfs_super *super = logfs_super(sb); struct logfs_area *area = super->s_journal_area; int i, err; - BUG_ON(logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + if (!(super->s_flags & LOGFS_SB_FLAG_DIRTY)) + return; + super->s_flags &= ~LOGFS_SB_FLAG_DIRTY; + + BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); mutex_lock(&super->s_journal_mutex); /* Do this first or suffer corruption */ @@ -821,7 +824,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb) area->a_is_open = 0; area->a_used_bytes = 0; /* Write journal */ - logfs_write_anchor(super->s_master_inode); + logfs_write_anchor(sb); /* Write superblocks */ err = logfs_write_sb(sb); BUG_ON(err); diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index 72592114a28f..129779431373 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -82,7 +82,7 @@ /* Read-only filesystem */ #define LOGFS_SB_FLAG_RO 0x0001 -#define LOGFS_SB_FLAG_SEG_ALIAS 0x0002 +#define LOGFS_SB_FLAG_DIRTY 0x0002 #define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004 #define LOGFS_SB_FLAG_SHUTDOWN 0x0008 @@ -526,7 +526,7 @@ void logfs_delete_inode(struct inode *inode); void logfs_clear_inode(struct inode *inode); /* journal.c */ -void logfs_write_anchor(struct inode *inode); +void logfs_write_anchor(struct super_block *sb); int logfs_init_journal(struct super_block *sb); void logfs_cleanup_journal(struct super_block *sb); int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 1dbe6e8cccec..7a23b3e7c0a7 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -421,7 +421,7 @@ static void inode_write_block(struct logfs_block *block) inode = block->inode; if (inode->i_ino == LOGFS_INO_MASTER) - logfs_write_anchor(inode); + logfs_write_anchor(inode->i_sb); else { ret = __logfs_write_inode(inode, 0); /* see indirect_write_block comment */ diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 664cd0dd3576..1a14f9910d55 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -352,7 +352,8 @@ int logfs_segment_write(struct inode *inode, struct page *page, int ret; void *buf; - BUG_ON(logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + super->s_flags |= LOGFS_SB_FLAG_DIRTY; + BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED; if (shadow->gc_level != 0) { /* temporarily disable compression for indirect blocks */ @@ -653,11 +654,13 @@ int logfs_segment_read(struct inode *inode, struct page *page, int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow) { struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); struct logfs_object_header h; u16 len; int err; - BUG_ON(logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + super->s_flags |= LOGFS_SB_FLAG_DIRTY; + BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED); if (!shadow->old_ofs) return 0; diff --git a/fs/logfs/super.c b/fs/logfs/super.c index 94d80f7ee7c2..1d081b7ede5a 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -490,7 +490,7 @@ static void logfs_kill_sb(struct super_block *sb) log_super("LogFS: Start unmounting\n"); /* Alias entries slow down mount, so evict as many as possible */ sync_filesystem(sb); - logfs_write_anchor(super->s_master_inode); + logfs_write_anchor(sb); /* * From this point on alias entries are simply dropped - and any -- cgit v1.2.3 From 26245c949c8473ea7352907b5a54bc34487eb87f Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Wed, 6 Jan 2010 17:20:35 +0100 Subject: quota: Cleanup S_NOQUOTA handling Cleanup handling of S_NOQUOTA inode flag and document it a bit. The flag does not have to be set under dqptr_sem. Only functions modifying inode's dquot pointers have to check the flag under dqptr_sem before going forward with the modification. This way we are sure that we cannot add new dquot pointers to the inode which is just becoming a quota file. The good thing about this cleanup is that there are no more places in quota code which enforce i_mutex vs. dqptr_sem lock ordering (in particular that dqptr_sem -> i_mutex of quota file). This should silence some (false) lockdep warnings with ext4 + quota and generally make life of some filesystems easier. Signed-off-by: Jan Kara <jack@suse.cz> --- fs/quota/dquot.c | 47 +++++++++++------------------------------------ 1 file changed, 11 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 3fc62b097bed..f6eaf0d8fd6a 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -100,9 +100,13 @@ * * Any operation working on dquots via inode pointers must hold dqptr_sem. If * operation is just reading pointers from inode (or not using them at all) the - * read lock is enough. If pointers are altered function must hold write lock - * (these locking rules also apply for S_NOQUOTA flag in the inode - note that - * for altering the flag i_mutex is also needed). + * read lock is enough. If pointers are altered function must hold write lock. + * Special care needs to be taken about S_NOQUOTA inode flag (marking that + * inode is a quota file). Functions adding pointers from inode to dquots have + * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they + * have to do all pointer modifications before dropping dqptr_sem. This makes + * sure they cannot race with quotaon which first sets S_NOQUOTA flag and + * then drops all pointers to dquots from an inode. * * Each dquot has its dq_lock mutex. Locked dquots might not be referenced * from inodes (dquot_alloc_space() and such don't check the dq_lock). @@ -1275,7 +1279,6 @@ int dquot_initialize(struct inode *inode, int type) } down_write(&sb_dqopt(sb)->dqptr_sem); - /* Having dqptr_sem we know NOQUOTA flags can't be altered... */ if (IS_NOQUOTA(inode)) goto out_err; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1431,11 +1434,6 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { - inode_incr_space(inode, number, reserve); - goto out_unlock; - } - for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; @@ -1466,7 +1464,6 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, mark_all_dquot_dirty(inode->i_dquot); out_flush_warn: flush_warnings(inode->i_dquot, warntype); -out_unlock: up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); out: return ret; @@ -1499,10 +1496,6 @@ int dquot_alloc_inode(const struct inode *inode, qsize_t number) for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; - } spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) @@ -1539,12 +1532,6 @@ int dquot_claim_space(struct inode *inode, qsize_t number) } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - inode_claim_rsv_space(inode, number); - goto out; - } - spin_lock(&dq_data_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1573,17 +1560,11 @@ int __dquot_free_space(struct inode *inode, qsize_t number, int reserve) /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) { -out_sub: inode_decr_space(inode, number, reserve); return QUOTA_OK; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Now recheck reliably when holding dqptr_sem */ - if (IS_NOQUOTA(inode)) { - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - goto out_sub; - } spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) @@ -1636,11 +1617,6 @@ int dquot_free_inode(const struct inode *inode, qsize_t number) return QUOTA_OK; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Now recheck reliably when holding dqptr_sem */ - if (IS_NOQUOTA(inode)) { - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; - } spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) @@ -1692,7 +1668,6 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) GRPQUOTA); down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); goto put_all; @@ -2010,13 +1985,15 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, /* We don't want quota and atime on quota files (deadlocks * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ - down_write(&dqopt->dqptr_sem); mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; mutex_unlock(&inode->i_mutex); - up_write(&dqopt->dqptr_sem); + /* + * When S_NOQUOTA is set, remove dquot references as no more + * references can be added + */ sb->dq_op->drop(inode); } @@ -2053,14 +2030,12 @@ out_file_init: iput(inode); out_lock: if (oldflags != -1) { - down_write(&dqopt->dqptr_sem); mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); /* Set the flags back (in the case of accidental quotaon() * on a wrong file we don't want to mess up the flags) */ inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); inode->i_flags |= oldflags; mutex_unlock(&inode->i_mutex); - up_write(&dqopt->dqptr_sem); } mutex_unlock(&dqopt->dqonoff_mutex); out_fmt: -- cgit v1.2.3 From 9df93939b735dd273e49cbee290b9f4738500ef4 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Wed, 6 Jan 2010 21:58:48 +0100 Subject: ext3: Use bitops to read/modify EXT3_I(inode)->i_state At several places we modify EXT3_I(inode)->i_state without holding i_mutex (ext3_release_file, ext3_bmap, ext3_journalled_writepage, ext3_do_update_inode, ...). These modifications are racy and we can lose updates to i_state. So convert handling of i_state to use bitops which are atomic. Signed-off-by: Jan Kara <jack@suse.cz> --- fs/ext3/file.c | 4 ++-- fs/ext3/inode.c | 18 +++++++++--------- fs/ext3/xattr.c | 14 +++++++------- include/linux/ext3_fs.h | 33 +++++++++++++++++++++++++-------- include/linux/ext3_fs_i.h | 2 +- 5 files changed, 44 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 388bbdfa0b4e..a86d3302cdc2 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -33,9 +33,9 @@ */ static int ext3_release_file (struct inode * inode, struct file * filp) { - if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) { + if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) { filemap_flush(inode->i_mapping); - EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE; + ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 455e6e6e5cb9..44b53386ab8b 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1378,7 +1378,7 @@ static int ext3_journalled_write_end(struct file *file, */ if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); - EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; + ext3_set_inode_state(inode, EXT3_STATE_JDATA); if (inode->i_size > EXT3_I(inode)->i_disksize) { EXT3_I(inode)->i_disksize = inode->i_size; ret2 = ext3_mark_inode_dirty(handle, inode); @@ -1417,7 +1417,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block) journal_t *journal; int err; - if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) { + if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) { /* * This is a REALLY heavyweight approach, but the use of * bmap on dirty files is expected to be extremely rare: @@ -1436,7 +1436,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block) * everything they get. */ - EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA; + ext3_clear_inode_state(inode, EXT3_STATE_JDATA); journal = EXT3_JOURNAL(inode); journal_lock_updates(journal); err = journal_flush(journal); @@ -1670,7 +1670,7 @@ static int ext3_journalled_writepage(struct page *page, PAGE_CACHE_SIZE, NULL, write_end_fn); if (ret == 0) ret = err; - EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; + ext3_set_inode_state(inode, EXT3_STATE_JDATA); unlock_page(page); } else { /* @@ -2402,7 +2402,7 @@ void ext3_truncate(struct inode *inode) goto out_notrans; if (inode->i_size == 0 && ext3_should_writeback_data(inode)) - ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE; + ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); /* * We have to lock the EOF page here, because lock_page() nests @@ -2721,7 +2721,7 @@ int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) { /* We have all inode data except xattrs in memory here. */ return __ext3_get_inode_loc(inode, iloc, - !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)); + !ext3_test_inode_state(inode, EXT3_STATE_XATTR)); } void ext3_set_inode_flags(struct inode *inode) @@ -2893,7 +2893,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) - ei->i_state |= EXT3_STATE_XATTR; + ext3_set_inode_state(inode, EXT3_STATE_XATTR); } } else ei->i_extra_isize = 0; @@ -2955,7 +2955,7 @@ again: /* For fields not not tracking in the in-memory inode, * initialise them to zero for new inodes. */ - if (ei->i_state & EXT3_STATE_NEW) + if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); ext3_get_inode_flags(ei); @@ -3052,7 +3052,7 @@ again: rc = ext3_journal_dirty_metadata(handle, bh); if (!err) err = rc; - ei->i_state &= ~EXT3_STATE_NEW; + ext3_clear_inode_state(inode, EXT3_STATE_NEW); atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); out_brelse: diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 66895ccf76c7..2d2fb2a85961 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -274,7 +274,7 @@ ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *end; int error; - if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)) + if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) return -ENODATA; error = ext3_get_inode_loc(inode, &iloc); if (error) @@ -403,7 +403,7 @@ ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) void *end; int error; - if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)) + if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) return 0; error = ext3_get_inode_loc(inode, &iloc); if (error) @@ -882,7 +882,7 @@ ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i, is->s.base = is->s.first = IFIRST(header); is->s.here = is->s.first; is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; - if (EXT3_I(inode)->i_state & EXT3_STATE_XATTR) { + if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) { error = ext3_xattr_check_names(IFIRST(header), is->s.end); if (error) return error; @@ -914,10 +914,10 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, header = IHDR(inode, ext3_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); - EXT3_I(inode)->i_state |= EXT3_STATE_XATTR; + ext3_set_inode_state(inode, EXT3_STATE_XATTR); } else { header->h_magic = cpu_to_le32(0); - EXT3_I(inode)->i_state &= ~EXT3_STATE_XATTR; + ext3_clear_inode_state(inode, EXT3_STATE_XATTR); } return 0; } @@ -967,10 +967,10 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (error) goto cleanup; - if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) { + if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) { struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc); memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); - EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW; + ext3_clear_inode_state(inode, EXT3_STATE_NEW); } error = ext3_xattr_ibody_find(inode, &i, &is); diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 6b049030fbe6..e6590f8f0b3c 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -202,14 +202,6 @@ static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags) return flags & EXT3_OTHER_FLMASK; } -/* - * Inode dynamic state flags - */ -#define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ -#define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ -#define EXT3_STATE_XATTR 0x00000004 /* has in-inode xattrs */ -#define EXT3_STATE_FLUSH_ON_CLOSE 0x00000008 - /* Used to pass group descriptor data when online resize is done */ struct ext3_new_group_input { __u32 group; /* Group number for this data */ @@ -560,6 +552,31 @@ static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino) (ino >= EXT3_FIRST_INO(sb) && ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)); } + +/* + * Inode dynamic state flags + */ +enum { + EXT3_STATE_JDATA, /* journaled data exists */ + EXT3_STATE_NEW, /* inode is newly created */ + EXT3_STATE_XATTR, /* has in-inode xattrs */ + EXT3_STATE_FLUSH_ON_CLOSE, /* flush dirty pages on close */ +}; + +static inline int ext3_test_inode_state(struct inode *inode, int bit) +{ + return test_bit(bit, &EXT3_I(inode)->i_state); +} + +static inline void ext3_set_inode_state(struct inode *inode, int bit) +{ + set_bit(bit, &EXT3_I(inode)->i_state); +} + +static inline void ext3_clear_inode_state(struct inode *inode, int bit) +{ + clear_bit(bit, &EXT3_I(inode)->i_state); +} #else /* Assume that user mode programs are passing in an ext3fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h index 93e7428156ba..7679acdb519a 100644 --- a/include/linux/ext3_fs_i.h +++ b/include/linux/ext3_fs_i.h @@ -87,7 +87,7 @@ struct ext3_inode_info { * near to their parent directory's inode. */ __u32 i_block_group; - __u32 i_state; /* Dynamic state flags for ext3 */ + unsigned long i_state; /* Dynamic state flags for ext3 */ /* block reservation info */ struct ext3_block_alloc_info *i_block_alloc_info; -- cgit v1.2.3 From e3c9643597ac4bd6b5db62b5e7d915f8c8fa34b6 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov <dmonakhov@openvz.org> Date: Tue, 2 Feb 2010 16:05:51 +0300 Subject: ext3: mount flags manipulation cleanup Replace intermediate EXT3_MOUNT_XXX flags manipulation to corresponding macro. Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/ext3/super.c | 47 ++++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index afa2b569da10..7950ff6ec4e8 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -181,7 +181,7 @@ static void ext3_handle_error(struct super_block *sb) if (!test_opt (sb, ERRORS_CONT)) { journal_t *journal = EXT3_SB(sb)->s_journal; - EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; + set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); if (journal) journal_abort(journal, -EIO); } @@ -296,7 +296,7 @@ void ext3_abort (struct super_block * sb, const char * function, "error: remounting filesystem read-only"); EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; sb->s_flags |= MS_RDONLY; - EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; + set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); if (EXT3_SB(sb)->s_journal) journal_abort(EXT3_SB(sb)->s_journal, -EIO); } @@ -562,10 +562,10 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl if (sbi->s_qf_names[GRPQUOTA]) seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - if (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) + if (test_opt(sb, USRQUOTA)) seq_puts(seq, ",usrquota"); - if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) + if (test_opt(sb, GRPQUOTA)) seq_puts(seq, ",grpquota"); #endif } @@ -656,8 +656,7 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); - seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt & - EXT3_MOUNT_DATA_FLAGS)); + seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS))); if (test_opt(sb, DATA_ERR_ABORT)) seq_puts(seq, ",data_err=abort"); @@ -1065,20 +1064,19 @@ static int parse_options (char *options, struct super_block *sb, data_opt = EXT3_MOUNT_WRITEBACK_DATA; datacheck: if (is_remount) { - if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) - == data_opt) + if (test_opt(sb, DATA_FLAGS) == data_opt) break; ext3_msg(sb, KERN_ERR, "error: cannot change " "data mode on remount. The filesystem " "is mounted in data=%s mode and you " "try to remount it in data=%s mode.", - data_mode_string(sbi->s_mount_opt & - EXT3_MOUNT_DATA_FLAGS), + data_mode_string(test_opt(sb, + DATA_FLAGS)), data_mode_string(data_opt)); return 0; } else { - sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS; + clear_opt(sbi->s_mount_opt, DATA_FLAGS); sbi->s_mount_opt |= data_opt; } break; @@ -1244,18 +1242,13 @@ set_qf_format: } #ifdef CONFIG_QUOTA if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { - if ((sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) && - sbi->s_qf_names[USRQUOTA]) + if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sbi->s_mount_opt, USRQUOTA); - - if ((sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) && - sbi->s_qf_names[GRPQUOTA]) + if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) clear_opt(sbi->s_mount_opt, GRPQUOTA); - if ((sbi->s_qf_names[USRQUOTA] && - (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) || - (sbi->s_qf_names[GRPQUOTA] && - (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) { + if ((sbi->s_qf_names[USRQUOTA] && test_opt(sb, GRPQUOTA)) || + (sbi->s_qf_names[GRPQUOTA] && test_opt(sb, USRQUOTA))) { ext3_msg(sb, KERN_ERR, "error: old and new quota " "format mixing."); return 0; @@ -1671,11 +1664,11 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, POSIX_ACL); #endif if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA) - sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA; + set_opt(sbi->s_mount_opt, JOURNAL_DATA); else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED) - sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA; + set_opt(sbi->s_mount_opt, ORDERED_DATA); else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK) - sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA; + set_opt(sbi->s_mount_opt, WRITEBACK_DATA); if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC) set_opt(sbi->s_mount_opt, ERRORS_PANIC); @@ -1694,7 +1687,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || @@ -2561,11 +2554,11 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) goto restore_opts; } - if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) + if (test_opt(sb, ABORT)) ext3_abort(sb, __func__, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); es = sbi->s_es; @@ -2573,7 +2566,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || n_blocks_count > le32_to_cpu(es->s_blocks_count)) { - if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) { + if (test_opt(sb, ABORT)) { err = -EROFS; goto restore_opts; } -- cgit v1.2.3 From e1f5c67a1994312300ebc41195e6f5bea2f6c065 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov <dmonakhov@openvz.org> Date: Tue, 2 Feb 2010 16:05:53 +0300 Subject: ext3: trivial quota cleanup The patch is aimed to reorganize and simplify quota code a bit. Quota code is itself complex enouth, but we can make it more readable in some places: - Move quota option parsing to separate functions. - Simplify old-quota and journaled-quota mix check. Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/ext3/super.c | 121 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 67 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 7950ff6ec4e8..241c520b3081 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -895,6 +895,63 @@ static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb) return sb_block; } +#ifdef CONFIG_QUOTA +static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char *qname; + + if (sb_any_quota_loaded(sb) && + !sbi->s_qf_names[qtype]) { + ext3_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return 0; + } + qname = match_strdup(args); + if (!qname) { + ext3_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return 0; + } + if (sbi->s_qf_names[qtype] && + strcmp(sbi->s_qf_names[qtype], qname)) { + ext3_msg(sb, KERN_ERR, + "%s quota file already specified", QTYPE2NAME(qtype)); + kfree(qname); + return 0; + } + sbi->s_qf_names[qtype] = qname; + if (strchr(sbi->s_qf_names[qtype], '/')) { + ext3_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; + } + set_opt(sbi->s_mount_opt, QUOTA); + return 1; +} + +static int clear_qf_name(struct super_block *sb, int qtype) { + + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (sb_any_quota_loaded(sb) && + sbi->s_qf_names[qtype]) { + ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return 0; + } + /* + * The space will be released later when all options are confirmed + * to be correct + */ + sbi->s_qf_names[qtype] = NULL; + return 1; +} +#endif + static int parse_options (char *options, struct super_block *sb, unsigned int *inum, unsigned long *journal_devnum, ext3_fsblk_t *n_blocks_count, int is_remount) @@ -905,8 +962,7 @@ static int parse_options (char *options, struct super_block *sb, int data_opt = 0; int option; #ifdef CONFIG_QUOTA - int qtype, qfmt; - char *qname; + int qfmt; #endif if (!options) @@ -1088,62 +1144,20 @@ static int parse_options (char *options, struct super_block *sb, break; #ifdef CONFIG_QUOTA case Opt_usrjquota: - qtype = USRQUOTA; - goto set_qf_name; - case Opt_grpjquota: - qtype = GRPQUOTA; -set_qf_name: - if (sb_any_quota_loaded(sb) && - !sbi->s_qf_names[qtype]) { - ext3_msg(sb, KERN_ERR, - "error: cannot change journaled " - "quota options when quota turned on."); + if (!set_qf_name(sb, USRQUOTA, &args[0])) return 0; - } - qname = match_strdup(&args[0]); - if (!qname) { - ext3_msg(sb, KERN_ERR, - "error: not enough memory for " - "storing quotafile name."); - return 0; - } - if (sbi->s_qf_names[qtype] && - strcmp(sbi->s_qf_names[qtype], qname)) { - ext3_msg(sb, KERN_ERR, - "error: %s quota file already " - "specified.", QTYPE2NAME(qtype)); - kfree(qname); - return 0; - } - sbi->s_qf_names[qtype] = qname; - if (strchr(sbi->s_qf_names[qtype], '/')) { - ext3_msg(sb, KERN_ERR, - "error: quotafile must be on " - "filesystem root."); - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + break; + case Opt_grpjquota: + if (!set_qf_name(sb, GRPQUOTA, &args[0])) return 0; - } - set_opt(sbi->s_mount_opt, QUOTA); break; case Opt_offusrjquota: - qtype = USRQUOTA; - goto clear_qf_name; + if (!clear_qf_name(sb, USRQUOTA)) + return 0; + break; case Opt_offgrpjquota: - qtype = GRPQUOTA; -clear_qf_name: - if (sb_any_quota_loaded(sb) && - sbi->s_qf_names[qtype]) { - ext3_msg(sb, KERN_ERR, "error: cannot change " - "journaled quota options when " - "quota turned on."); + if (!clear_qf_name(sb, GRPQUOTA)) return 0; - } - /* - * The space will be released later when all options - * are confirmed to be correct - */ - sbi->s_qf_names[qtype] = NULL; break; case Opt_jqfmt_vfsold: qfmt = QFMT_VFS_OLD; @@ -1247,8 +1261,7 @@ set_qf_format: if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) clear_opt(sbi->s_mount_opt, GRPQUOTA); - if ((sbi->s_qf_names[USRQUOTA] && test_opt(sb, GRPQUOTA)) || - (sbi->s_qf_names[GRPQUOTA] && test_opt(sb, USRQUOTA))) { + if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { ext3_msg(sb, KERN_ERR, "error: old and new quota " "format mixing."); return 0; -- cgit v1.2.3 From c469070aea5a0ada45a836937c776fd3083dae2b Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov <dmonakhov@openvz.org> Date: Tue, 9 Feb 2010 17:53:36 +0100 Subject: quota: manage reserved space when quota is not active [v2] Since we implemented generic reserved space management interface, then it is possible to account reserved space even when quota is not active (similar to i_blocks/i_bytes). Without this patch following testcase result in massive comlain from WARN_ON in dquot_claim_space() TEST_CASE: mount /dev/sdb /mnt -oquota dd if=/dev/zero of=/mnt/test bs=1M count=1 quotaon /mnt # fs_reserved_spave == 1Mb # quota_reserved_space == 0, because quota was disabled dd if=/dev/zero of=/mnt/test seek=1 bs=1M count=1 # fs_reserved_spave == 2Mb # quota_reserved_space == 1Mb sync # ->dquot_claim_space() -> WARN_ON Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/quota/dquot.c | 10 ++++++---- include/linux/quotaops.h | 11 +++++++++-- 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index f6eaf0d8fd6a..f11255b18b58 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1354,28 +1354,30 @@ static qsize_t *inode_reserved_space(struct inode * inode) return inode->i_sb->dq_op->get_reserved_space(inode); } -static void inode_add_rsv_space(struct inode *inode, qsize_t number) +void inode_add_rsv_space(struct inode *inode, qsize_t number) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) += number; spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL(inode_add_rsv_space); - -static void inode_claim_rsv_space(struct inode *inode, qsize_t number) +void inode_claim_rsv_space(struct inode *inode, qsize_t number) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL(inode_claim_rsv_space); -static void inode_sub_rsv_space(struct inode *inode, qsize_t number) +void inode_sub_rsv_space(struct inode *inode, qsize_t number) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL(inode_sub_rsv_space); static qsize_t inode_get_rsv_space(struct inode *inode) { diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 3ebb23153640..a529d86e7e73 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -26,6 +26,10 @@ static inline void writeout_quota_sb(struct super_block *sb, int type) sb->s_qcop->quota_sync(sb, type); } +void inode_add_rsv_space(struct inode *inode, qsize_t number); +void inode_claim_rsv_space(struct inode *inode, qsize_t number); +void inode_sub_rsv_space(struct inode *inode, qsize_t number); + int dquot_initialize(struct inode *inode, int type); int dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, unsigned int id, int type); @@ -42,7 +46,6 @@ int dquot_alloc_inode(const struct inode *inode, qsize_t number); int dquot_reserve_space(struct inode *inode, qsize_t number, int prealloc); int dquot_claim_space(struct inode *inode, qsize_t number); void dquot_release_reserved_space(struct inode *inode, qsize_t number); -qsize_t dquot_get_reserved_space(struct inode *inode); int dquot_free_space(struct inode *inode, qsize_t number); int dquot_free_inode(const struct inode *inode, qsize_t number); @@ -199,6 +202,8 @@ static inline int vfs_dq_reserve_space(struct inode *inode, qsize_t nr) if (inode->i_sb->dq_op->reserve_space(inode, nr, 0) == NO_QUOTA) return 1; } + else + inode_add_rsv_space(inode, nr); return 0; } @@ -221,7 +226,7 @@ static inline int vfs_dq_claim_space(struct inode *inode, qsize_t nr) if (inode->i_sb->dq_op->claim_space(inode, nr) == NO_QUOTA) return 1; } else - inode_add_bytes(inode, nr); + inode_claim_rsv_space(inode, nr); mark_inode_dirty(inode); return 0; @@ -235,6 +240,8 @@ void vfs_dq_release_reservation_space(struct inode *inode, qsize_t nr) { if (sb_any_quota_active(inode->i_sb)) inode->i_sb->dq_op->release_rsv(inode, nr); + else + inode_sub_rsv_space(inode, nr); } static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr) -- cgit v1.2.3 From 0a5a9c725512461d19397490f3adf29931dca1f2 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Tue, 9 Feb 2010 18:20:39 +0100 Subject: quota: Fix warning when a delayed write happens before quota is enabled If a delayed-allocation write happens before quota is enabled, the kernel spits out a warning: WARNING: at fs/quota/dquot.c:988 dquot_claim_space+0x77/0x112() because the fact that user has some delayed allocation is not recorded in quota structure. Make dquot_initialize() update amount of reserved space for user if it sees inode has some space reserved. Also make sure that reserved quota space does not go negative and we warn about the filesystem bug just once. Signed-off-by: Jan Kara <jack@suse.cz> --- fs/quota/dquot.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index f11255b18b58..6c849de5dc8f 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -229,6 +229,8 @@ static struct hlist_head *dquot_hash; struct dqstats dqstats; EXPORT_SYMBOL(dqstats); +static qsize_t inode_get_rsv_space(struct inode *inode); + static inline unsigned int hashfn(const struct super_block *sb, unsigned int id, int type) { @@ -844,11 +846,14 @@ static int dqinit_needed(struct inode *inode, int type) static void add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; + int reserved = 0; spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) continue; + if (unlikely(inode_get_rsv_space(inode) > 0)) + reserved = 1; if (!atomic_read(&inode->i_writecount)) continue; if (!dqinit_needed(inode, type)) @@ -869,6 +874,12 @@ static void add_dquot_ref(struct super_block *sb, int type) } spin_unlock(&inode_lock); iput(old_inode); + + if (reserved) { + printk(KERN_WARNING "VFS (%s): Writes happened before quota" + " was turned on thus quota information is probably " + "inconsistent. Please run quotacheck(8).\n", sb->s_id); + } } /* @@ -982,10 +993,12 @@ static inline void dquot_resv_space(struct dquot *dquot, qsize_t number) /* * Claim reserved quota space */ -static void dquot_claim_reserved_space(struct dquot *dquot, - qsize_t number) +static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number) { - WARN_ON(dquot->dq_dqb.dqb_rsvspace < number); + if (dquot->dq_dqb.dqb_rsvspace < number) { + WARN_ON_ONCE(1); + number = dquot->dq_dqb.dqb_rsvspace; + } dquot->dq_dqb.dqb_curspace += number; dquot->dq_dqb.dqb_rsvspace -= number; } @@ -993,7 +1006,12 @@ static void dquot_claim_reserved_space(struct dquot *dquot, static inline void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) { - dquot->dq_dqb.dqb_rsvspace -= number; + if (dquot->dq_dqb.dqb_rsvspace >= number) + dquot->dq_dqb.dqb_rsvspace -= number; + else { + WARN_ON_ONCE(1); + dquot->dq_dqb.dqb_rsvspace = 0; + } } static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) @@ -1246,6 +1264,7 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space) return QUOTA_NL_BHARDBELOW; return QUOTA_NL_NOWARN; } + /* * Initialize quota pointers in inode * We do things in a bit complicated way but by that we avoid calling @@ -1257,6 +1276,7 @@ int dquot_initialize(struct inode *inode, int type) int cnt, ret = 0; struct dquot *got[MAXQUOTAS] = { NULL, NULL }; struct super_block *sb = inode->i_sb; + qsize_t rsv; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ @@ -1290,6 +1310,13 @@ int dquot_initialize(struct inode *inode, int type) if (!inode->i_dquot[cnt]) { inode->i_dquot[cnt] = got[cnt]; got[cnt] = NULL; + /* + * Make quota reservation system happy if someone + * did a write before quota was turned on + */ + rsv = inode_get_rsv_space(inode); + if (unlikely(rsv)) + dquot_resv_space(inode->i_dquot[cnt], rsv); } } out_err: -- cgit v1.2.3 From c411e5f66a5dd36827a5f9d1392a1afdf69ff075 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Tue, 16 Feb 2010 03:44:47 -0500 Subject: quota: split do_quotactl Split out a helper for each non-trivial command from do_quotactl. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/quota/quota.c | 250 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 146 insertions(+), 104 deletions(-) (limited to 'fs') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index ee91e2756950..4d7fdc4443b9 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -234,122 +234,164 @@ restart: spin_unlock(&sb_lock); } -/* Copy parameters and call proper function */ -static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, - void __user *addr) +static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, + void __user *addr) { + char *pathname; int ret; - switch (cmd) { - case Q_QUOTAON: { - char *pathname; - - pathname = getname(addr); - if (IS_ERR(pathname)) - return PTR_ERR(pathname); - ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); - putname(pathname); - return ret; - } - case Q_QUOTAOFF: - return sb->s_qcop->quota_off(sb, type, 0); + pathname = getname(addr); + if (IS_ERR(pathname)) + return PTR_ERR(pathname); + ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); + putname(pathname); + return ret; +} - case Q_GETFMT: { - __u32 fmt; +static int quota_getfmt(struct super_block *sb, int type, void __user *addr) +{ + __u32 fmt; - down_read(&sb_dqopt(sb)->dqptr_sem); - if (!sb_has_quota_active(sb, type)) { - up_read(&sb_dqopt(sb)->dqptr_sem); - return -ESRCH; - } - fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; - up_read(&sb_dqopt(sb)->dqptr_sem); - if (copy_to_user(addr, &fmt, sizeof(fmt))) - return -EFAULT; - return 0; - } - case Q_GETINFO: { - struct if_dqinfo info; - - ret = sb->s_qcop->get_info(sb, type, &info); - if (ret) - return ret; - if (copy_to_user(addr, &info, sizeof(info))) - return -EFAULT; - return 0; - } - case Q_SETINFO: { - struct if_dqinfo info; + down_read(&sb_dqopt(sb)->dqptr_sem); + if (!sb_has_quota_active(sb, type)) { + up_read(&sb_dqopt(sb)->dqptr_sem); + return -ESRCH; + } + fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; + up_read(&sb_dqopt(sb)->dqptr_sem); + if (copy_to_user(addr, &fmt, sizeof(fmt))) + return -EFAULT; + return 0; +} - if (copy_from_user(&info, addr, sizeof(info))) - return -EFAULT; - return sb->s_qcop->set_info(sb, type, &info); - } - case Q_GETQUOTA: { - struct if_dqblk idq; - - ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); - if (ret) - return ret; - if (copy_to_user(addr, &idq, sizeof(idq))) - return -EFAULT; - return 0; - } - case Q_SETQUOTA: { - struct if_dqblk idq; +static int quota_getinfo(struct super_block *sb, int type, void __user *addr) +{ + struct if_dqinfo info; + int ret; - if (copy_from_user(&idq, addr, sizeof(idq))) - return -EFAULT; - return sb->s_qcop->set_dqblk(sb, type, id, &idq); - } - case Q_SYNC: - if (sb) - sync_quota_sb(sb, type); - else - sync_dquots(type); - return 0; + ret = sb->s_qcop->get_info(sb, type, &info); + if (!ret && copy_to_user(addr, &info, sizeof(info))) + return -EFAULT; + return ret; +} - case Q_XQUOTAON: - case Q_XQUOTAOFF: - case Q_XQUOTARM: { - __u32 flags; +static int quota_setinfo(struct super_block *sb, int type, void __user *addr) +{ + struct if_dqinfo info; - if (copy_from_user(&flags, addr, sizeof(flags))) - return -EFAULT; - return sb->s_qcop->set_xstate(sb, flags, cmd); - } - case Q_XGETQSTAT: { - struct fs_quota_stat fqs; + if (copy_from_user(&info, addr, sizeof(info))) + return -EFAULT; + return sb->s_qcop->set_info(sb, type, &info); +} + +static int quota_getquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct if_dqblk idq; + int ret; + + ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); + if (ret) + return ret; + if (copy_to_user(addr, &idq, sizeof(idq))) + return -EFAULT; + return 0; +} + +static int quota_setquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct if_dqblk idq; + + if (copy_from_user(&idq, addr, sizeof(idq))) + return -EFAULT; + return sb->s_qcop->set_dqblk(sb, type, id, &idq); +} + +static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) +{ + __u32 flags; + + if (copy_from_user(&flags, addr, sizeof(flags))) + return -EFAULT; + return sb->s_qcop->set_xstate(sb, flags, cmd); +} + +static int quota_getxstate(struct super_block *sb, void __user *addr) +{ + struct fs_quota_stat fqs; + int ret; - if ((ret = sb->s_qcop->get_xstate(sb, &fqs))) - return ret; - if (copy_to_user(addr, &fqs, sizeof(fqs))) - return -EFAULT; - return 0; - } - case Q_XSETQLIM: { - struct fs_disk_quota fdq; + ret = sb->s_qcop->get_xstate(sb, &fqs); + if (!ret && copy_to_user(addr, &fqs, sizeof(fqs))) + return -EFAULT; + return ret; +} - if (copy_from_user(&fdq, addr, sizeof(fdq))) - return -EFAULT; - return sb->s_qcop->set_xquota(sb, type, id, &fdq); - } - case Q_XGETQUOTA: { - struct fs_disk_quota fdq; - - ret = sb->s_qcop->get_xquota(sb, type, id, &fdq); - if (ret) - return ret; - if (copy_to_user(addr, &fdq, sizeof(fdq))) - return -EFAULT; - return 0; - } - case Q_XQUOTASYNC: - return sb->s_qcop->quota_sync(sb, type); - /* We never reach here unless validity check is broken */ - default: - BUG(); +static int quota_setxquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct fs_disk_quota fdq; + + if (copy_from_user(&fdq, addr, sizeof(fdq))) + return -EFAULT; + return sb->s_qcop->set_xquota(sb, type, id, &fdq); +} + +static int quota_getxquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct fs_disk_quota fdq; + int ret; + + ret = sb->s_qcop->get_xquota(sb, type, id, &fdq); + if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) + return -EFAULT; + return ret; +} + +/* Copy parameters and call proper function */ +static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, + void __user *addr) +{ + switch (cmd) { + case Q_QUOTAON: + return quota_quotaon(sb, type, cmd, id, addr); + case Q_QUOTAOFF: + return sb->s_qcop->quota_off(sb, type, 0); + case Q_GETFMT: + return quota_getfmt(sb, type, addr); + case Q_GETINFO: + return quota_getinfo(sb, type, addr); + case Q_SETINFO: + return quota_setinfo(sb, type, addr); + case Q_GETQUOTA: + return quota_getquota(sb, type, id, addr); + case Q_SETQUOTA: + return quota_setquota(sb, type, id, addr); + case Q_SYNC: + if (sb) + sync_quota_sb(sb, type); + else + sync_dquots(type); + return 0; + case Q_XQUOTAON: + case Q_XQUOTAOFF: + case Q_XQUOTARM: + return quota_setxstate(sb, cmd, addr); + case Q_XGETQSTAT: + return quota_getxstate(sb, addr); + case Q_XSETQLIM: + return quota_setxquota(sb, type, id, addr); + case Q_XGETQUOTA: + return quota_getxquota(sb, type, id, addr); + case Q_XQUOTASYNC: + return sb->s_qcop->quota_sync(sb, type); + /* We never reach here unless validity check is broken */ + default: + BUG(); } + return 0; } -- cgit v1.2.3 From f450d4fee42c52e8045131a355b2de03094aa066 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Tue, 16 Feb 2010 03:44:48 -0500 Subject: quota: clean up checks for supported quota methods Move the checks for sb->s_qcop->foo next to the actual calls for them, same for sb_has_quota_active checks where applicable. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/quota/quota.c | 121 +++++++++++++++++-------------------------------------- 1 file changed, 37 insertions(+), 84 deletions(-) (limited to 'fs') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 4d7fdc4443b9..dcf7db91fc95 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -33,54 +33,6 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, if (sb && !sb->s_qcop) return -ENOSYS; - switch (cmd) { - case Q_GETFMT: - break; - case Q_QUOTAON: - if (!sb->s_qcop->quota_on) - return -ENOSYS; - break; - case Q_QUOTAOFF: - if (!sb->s_qcop->quota_off) - return -ENOSYS; - break; - case Q_SETINFO: - if (!sb->s_qcop->set_info) - return -ENOSYS; - break; - case Q_GETINFO: - if (!sb->s_qcop->get_info) - return -ENOSYS; - break; - case Q_SETQUOTA: - if (!sb->s_qcop->set_dqblk) - return -ENOSYS; - break; - case Q_GETQUOTA: - if (!sb->s_qcop->get_dqblk) - return -ENOSYS; - break; - case Q_SYNC: - if (sb && !sb->s_qcop->quota_sync) - return -ENOSYS; - break; - default: - return -EINVAL; - } - - /* Is quota turned on for commands which need it? */ - switch (cmd) { - case Q_GETFMT: - case Q_GETINFO: - case Q_SETINFO: - case Q_SETQUOTA: - case Q_GETQUOTA: - /* This is just an informative test so we are satisfied - * without the lock */ - if (!sb_has_quota_active(sb, type)) - return -ESRCH; - } - /* Check privileges */ if (cmd == Q_GETQUOTA) { if (((type == USRQUOTA && current_euid() != id) || @@ -106,33 +58,6 @@ static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, if (!sb->s_qcop) return -ENOSYS; - switch (cmd) { - case Q_XQUOTAON: - case Q_XQUOTAOFF: - case Q_XQUOTARM: - if (!sb->s_qcop->set_xstate) - return -ENOSYS; - break; - case Q_XGETQSTAT: - if (!sb->s_qcop->get_xstate) - return -ENOSYS; - break; - case Q_XSETQLIM: - if (!sb->s_qcop->set_xquota) - return -ENOSYS; - break; - case Q_XGETQUOTA: - if (!sb->s_qcop->get_xquota) - return -ENOSYS; - break; - case Q_XQUOTASYNC: - if (!sb->s_qcop->quota_sync) - return -ENOSYS; - break; - default: - return -EINVAL; - } - /* Check privileges */ if (cmd == Q_XGETQUOTA) { if (((type == XQM_USRQUOTA && current_euid() != id) || @@ -238,12 +163,13 @@ static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, void __user *addr) { char *pathname; - int ret; + int ret = -ENOSYS; pathname = getname(addr); if (IS_ERR(pathname)) return PTR_ERR(pathname); - ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); + if (sb->s_qcop->quota_on) + ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); putname(pathname); return ret; } @@ -269,6 +195,10 @@ static int quota_getinfo(struct super_block *sb, int type, void __user *addr) struct if_dqinfo info; int ret; + if (!sb_has_quota_active(sb, type)) + return -ESRCH; + if (!sb->s_qcop->get_info) + return -ENOSYS; ret = sb->s_qcop->get_info(sb, type, &info); if (!ret && copy_to_user(addr, &info, sizeof(info))) return -EFAULT; @@ -281,6 +211,10 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr) if (copy_from_user(&info, addr, sizeof(info))) return -EFAULT; + if (!sb_has_quota_active(sb, type)) + return -ESRCH; + if (!sb->s_qcop->set_info) + return -ENOSYS; return sb->s_qcop->set_info(sb, type, &info); } @@ -290,6 +224,10 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id, struct if_dqblk idq; int ret; + if (!sb_has_quota_active(sb, type)) + return -ESRCH; + if (!sb->s_qcop->get_dqblk) + return -ENOSYS; ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); if (ret) return ret; @@ -305,6 +243,10 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id, if (copy_from_user(&idq, addr, sizeof(idq))) return -EFAULT; + if (!sb_has_quota_active(sb, type)) + return -ESRCH; + if (!sb->s_qcop->set_dqblk) + return -ENOSYS; return sb->s_qcop->set_dqblk(sb, type, id, &idq); } @@ -314,6 +256,8 @@ static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) if (copy_from_user(&flags, addr, sizeof(flags))) return -EFAULT; + if (!sb->s_qcop->set_xstate) + return -ENOSYS; return sb->s_qcop->set_xstate(sb, flags, cmd); } @@ -321,7 +265,9 @@ static int quota_getxstate(struct super_block *sb, void __user *addr) { struct fs_quota_stat fqs; int ret; - + + if (!sb->s_qcop->get_xstate) + return -ENOSYS; ret = sb->s_qcop->get_xstate(sb, &fqs); if (!ret && copy_to_user(addr, &fqs, sizeof(fqs))) return -EFAULT; @@ -335,6 +281,8 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id, if (copy_from_user(&fdq, addr, sizeof(fdq))) return -EFAULT; + if (!sb->s_qcop->set_xquota) + return -ENOSYS; return sb->s_qcop->set_xquota(sb, type, id, &fdq); } @@ -344,6 +292,8 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, struct fs_disk_quota fdq; int ret; + if (!sb->s_qcop->get_xquota) + return -ENOSYS; ret = sb->s_qcop->get_xquota(sb, type, id, &fdq); if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) return -EFAULT; @@ -358,6 +308,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, case Q_QUOTAON: return quota_quotaon(sb, type, cmd, id, addr); case Q_QUOTAOFF: + if (!sb->s_qcop->quota_off) + return -ENOSYS; return sb->s_qcop->quota_off(sb, type, 0); case Q_GETFMT: return quota_getfmt(sb, type, addr); @@ -370,9 +322,11 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, case Q_SETQUOTA: return quota_setquota(sb, type, id, addr); case Q_SYNC: - if (sb) + if (sb) { + if (!sb->s_qcop->quota_sync) + return -ENOSYS; sync_quota_sb(sb, type); - else + } else sync_dquots(type); return 0; case Q_XQUOTAON: @@ -386,13 +340,12 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, case Q_XGETQUOTA: return quota_getxquota(sb, type, id, addr); case Q_XQUOTASYNC: + if (!sb->s_qcop->quota_sync) + return -ENOSYS; return sb->s_qcop->quota_sync(sb, type); - /* We never reach here unless validity check is broken */ default: - BUG(); + return -EINVAL; } - - return 0; } /* -- cgit v1.2.3 From 6ae09575b3c951ad77c07d068b8dbbc09031b2d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Tue, 16 Feb 2010 03:44:49 -0500 Subject: quota: special case Q_SYNC without device name The Q_SYNC command can be called without the path to a device, in which case it iterates over all superblocks. Special case this variant directly in sys_quotactl so that the other code always gets a superblock and doesn't need to deal with this case. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/quota/quota.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index dcf7db91fc95..9fde5cd84f8d 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -124,10 +124,17 @@ void sync_quota_sb(struct super_block *sb, int type) } #endif -static void sync_dquots(int type) +static int quota_sync_all(int type) { struct super_block *sb; int cnt; + int ret; + + if (type >= MAXQUOTAS) + return -EINVAL; + ret = security_quotactl(Q_SYNC, type, 0, NULL); + if (ret) + return ret; spin_lock(&sb_lock); restart: @@ -157,6 +164,8 @@ restart: goto restart; } spin_unlock(&sb_lock); + + return 0; } static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, @@ -322,12 +331,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, case Q_SETQUOTA: return quota_setquota(sb, type, id, addr); case Q_SYNC: - if (sb) { - if (!sb->s_qcop->quota_sync) - return -ENOSYS; - sync_quota_sb(sb, type); - } else - sync_dquots(type); + if (!sb->s_qcop->quota_sync) + return -ENOSYS; + sync_quota_sb(sb, type); return 0; case Q_XQUOTAON: case Q_XQUOTAOFF: @@ -392,18 +398,26 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, cmds = cmd >> SUBCMDSHIFT; type = cmd & SUBCMDMASK; - if (cmds != Q_SYNC || special) { - sb = quotactl_block(special); - if (IS_ERR(sb)) - return PTR_ERR(sb); + /* + * As a special case Q_SYNC can be called without a specific device. + * It will iterate all superblocks that have quota enabled and call + * the sync action on each of them. + */ + if (!special) { + if (cmds == Q_SYNC) + return quota_sync_all(type); + return -ENODEV; } + sb = quotactl_block(special); + if (IS_ERR(sb)) + return PTR_ERR(sb); + ret = check_quotactl_valid(sb, type, cmds, id); if (ret >= 0) ret = do_quotactl(sb, type, cmds, id, addr); - if (sb) - drop_super(sb); + drop_super(sb); return ret; } -- cgit v1.2.3 From c988afb5fa3fc450207c3dfc0ce535f4bfdae4d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Tue, 16 Feb 2010 03:44:50 -0500 Subject: quota: simplify permission checking Stop having complicated different routines for checking permissions for XQM vs "VFS" quotas. Instead do the checks for having sb->s_qcop and a valid type directly in do_quotactl, and munge the *quotactl_valid functions into a check_quotactl_permission helper that only checks for permissions. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/quota/quota.c | 92 +++++++++++++++++++------------------------------------- 1 file changed, 31 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 9fde5cd84f8d..d0efe302b1c1 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -21,69 +21,30 @@ #include <net/netlink.h> #include <net/genetlink.h> -/* Check validity of generic quotactl commands */ -static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, - qid_t id) +static int check_quotactl_permission(struct super_block *sb, int type, int cmd, + qid_t id) { - if (type >= MAXQUOTAS) - return -EINVAL; - if (!sb && cmd != Q_SYNC) - return -ENODEV; - /* Is operation supported? */ - if (sb && !sb->s_qcop) - return -ENOSYS; - - /* Check privileges */ - if (cmd == Q_GETQUOTA) { - if (((type == USRQUOTA && current_euid() != id) || - (type == GRPQUOTA && !in_egroup_p(id))) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - } - else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO) - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - return 0; -} - -/* Check validity of XFS Quota Manager commands */ -static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, - qid_t id) -{ - if (type >= XQM_MAXQUOTAS) - return -EINVAL; - if (!sb) - return -ENODEV; - if (!sb->s_qcop) - return -ENOSYS; - - /* Check privileges */ - if (cmd == Q_XGETQUOTA) { - if (((type == XQM_USRQUOTA && current_euid() != id) || - (type == XQM_GRPQUOTA && !in_egroup_p(id))) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) { + switch (cmd) { + /* these commands do not require any special privilegues */ + case Q_GETFMT: + case Q_SYNC: + case Q_GETINFO: + case Q_XGETQSTAT: + case Q_XQUOTASYNC: + break; + /* allow to query information for dquots we "own" */ + case Q_GETQUOTA: + case Q_XGETQUOTA: + if ((type == USRQUOTA && current_euid() == id) || + (type == GRPQUOTA && in_egroup_p(id))) + break; + /*FALLTHROUGH*/ + default: if (!capable(CAP_SYS_ADMIN)) return -EPERM; } - return 0; -} - -static int check_quotactl_valid(struct super_block *sb, int type, int cmd, - qid_t id) -{ - int error; - - if (XQM_COMMAND(cmd)) - error = xqm_quotactl_valid(sb, type, cmd, id); - else - error = generic_quotactl_valid(sb, type, cmd, id); - if (!error) - error = security_quotactl(cmd, type, id, sb); - return error; + return security_quotactl(cmd, type, id, sb); } #ifdef CONFIG_QUOTA @@ -313,6 +274,17 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void __user *addr) { + int ret; + + if (type >= (XQM_COMMAND(cmd) ? XQM_MAXQUOTAS : MAXQUOTAS)) + return -EINVAL; + if (!sb->s_qcop) + return -ENOSYS; + + ret = check_quotactl_permission(sb, type, cmd, id); + if (ret < 0) + return ret; + switch (cmd) { case Q_QUOTAON: return quota_quotaon(sb, type, cmd, id, addr); @@ -413,9 +385,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, if (IS_ERR(sb)) return PTR_ERR(sb); - ret = check_quotactl_valid(sb, type, cmds, id); - if (ret >= 0) - ret = do_quotactl(sb, type, cmds, id, addr); + ret = do_quotactl(sb, type, cmds, id, addr); drop_super(sb); return ret; -- cgit v1.2.3 From 8c4e4acd660a09e571a71583b5bbe1eee700c9ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Tue, 16 Feb 2010 03:44:51 -0500 Subject: quota: clean up Q_XQUOTASYNC Currently Q_XQUOTASYNC calls into the quota_sync method, but XFS does something entirely different in it than the rest of the filesystems. xfs_quota which calls Q_XQUOTASYNC expects an asynchronous data writeout to flush delayed allocations, while the "VFS" quota support wants to flush changes to the quota file. So make Q_XQUOTASYNC call into the writeback code directly and make the quota_sync method optional as XFS doesn't need in the sense expected by the rest of the quota code. GFS2 was using limited XFS-style quota and has a quota_sync method fitting neither the style used by vfs_quota_sync nor xfs_fs_quota_sync. I left it in for now as per discussion with Steve it expects to be called from the sync path this way. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/quota/quota.c | 11 +++++++---- fs/xfs/linux-2.6/xfs_quotaops.c | 15 --------------- include/linux/quotaops.h | 2 +- 3 files changed, 8 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index d0efe302b1c1..3d31228082ea 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -18,6 +18,7 @@ #include <linux/capability.h> #include <linux/quotaops.h> #include <linux/types.h> +#include <linux/writeback.h> #include <net/netlink.h> #include <net/genetlink.h> @@ -52,7 +53,7 @@ void sync_quota_sb(struct super_block *sb, int type) { int cnt; - if (!sb->s_qcop->quota_sync) + if (!sb->s_qcop || !sb->s_qcop->quota_sync) return; sb->s_qcop->quota_sync(sb, type); @@ -318,9 +319,11 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, case Q_XGETQUOTA: return quota_getxquota(sb, type, id, addr); case Q_XQUOTASYNC: - if (!sb->s_qcop->quota_sync) - return -ENOSYS; - return sb->s_qcop->quota_sync(sb, type); + /* caller already holds s_umount */ + if (sb->s_flags & MS_RDONLY) + return -EROFS; + writeback_inodes_sb(sb); + return 0; default: return -EINVAL; } diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index 3d4a0c84d634..07d67c624922 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -43,20 +43,6 @@ xfs_quota_type(int type) } } -STATIC int -xfs_fs_quota_sync( - struct super_block *sb, - int type) -{ - struct xfs_mount *mp = XFS_M(sb); - - if (sb->s_flags & MS_RDONLY) - return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; - return -xfs_sync_data(mp, 0); -} - STATIC int xfs_fs_get_xstate( struct super_block *sb, @@ -151,7 +137,6 @@ xfs_fs_set_xquota( } const struct quotactl_ops xfs_quotactl_operations = { - .quota_sync = xfs_fs_quota_sync, .get_xstate = xfs_fs_get_xstate, .set_xstate = xfs_fs_set_xstate, .get_xquota = xfs_fs_get_xquota, diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index a529d86e7e73..69d26bc0f884 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -22,7 +22,7 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb) void sync_quota_sb(struct super_block *sb, int type); static inline void writeout_quota_sb(struct super_block *sb, int type) { - if (sb->s_qcop->quota_sync) + if (sb->s_qcop && sb->s_qcop->quota_sync) sb->s_qcop->quota_sync(sb, type); } -- cgit v1.2.3 From 5fb324ad24febe57a8a2e62903dcb7bad546ea71 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Tue, 16 Feb 2010 03:44:52 -0500 Subject: quota: move code from sync_quota_sb into vfs_quota_sync Currenly sync_quota_sb does a lot of sync and truncate action that only applies to "VFS" style quotas and is actively harmful for the sync performance in XFS. Move it into vfs_quota_sync and add a wait parameter to ->quota_sync to tell if we need it or not. My audit of the GFS2 code says it's also not needed given the way GFS2 implements quotas, but I'd be happy if this can get a detailed review. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/gfs2/quota.c | 9 +++++++-- fs/gfs2/quota.h | 2 +- fs/gfs2/super.c | 2 +- fs/gfs2/sys.c | 2 +- fs/quota/dquot.c | 29 ++++++++++++++++++++++++++++- fs/quota/quota.c | 46 +++++----------------------------------------- fs/sync.c | 14 +++++++------- include/linux/quota.h | 2 +- include/linux/quotaops.h | 17 +---------------- 9 files changed, 52 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index e3bf6eab8750..6dbcbad6ab17 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1083,7 +1083,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, } } -int gfs2_quota_sync(struct super_block *sb, int type) +int gfs2_quota_sync(struct super_block *sb, int type, int wait) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_data **qda; @@ -1127,6 +1127,11 @@ int gfs2_quota_sync(struct super_block *sb, int type) return error; } +static int gfs2_quota_sync_timeo(struct super_block *sb, int type) +{ + return gfs2_quota_sync(sb, type, 0); +} + int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) { struct gfs2_quota_data *qd; @@ -1382,7 +1387,7 @@ int gfs2_quotad(void *data) &tune->gt_statfs_quantum); /* Update quota file */ - quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, + quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t, "ad_timeo, &tune->gt_quota_quantum); /* Check for & recover partially truncated inodes */ diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index e271fa07ad02..195f60c8bd14 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -25,7 +25,7 @@ extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, u32 uid, u32 gid); -extern int gfs2_quota_sync(struct super_block *sb, int type); +extern int gfs2_quota_sync(struct super_block *sb, int type, int wait); extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); extern int gfs2_quota_init(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b9dd3da22c0a..a8c2bcd0fcc8 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -764,7 +764,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) int error; flush_workqueue(gfs2_delete_workqueue); - gfs2_quota_sync(sdp->sd_vfs, 0); + gfs2_quota_sync(sdp->sd_vfs, 0, 1); gfs2_statfs_sync(sdp->sd_vfs, 0); error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 0dc34621f6a6..4496cc37a0fa 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -167,7 +167,7 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, if (simple_strtol(buf, NULL, 0) != 1) return -EINVAL; - gfs2_quota_sync(sdp->sd_vfs, 0); + gfs2_quota_sync(sdp->sd_vfs, 0, 1); return len; } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 6c849de5dc8f..4c2213f7ed36 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -570,7 +570,7 @@ out: } EXPORT_SYMBOL(dquot_scan_active); -int vfs_quota_sync(struct super_block *sb, int type) +int vfs_quota_sync(struct super_block *sb, int type, int wait) { struct list_head *dirty; struct dquot *dquot; @@ -615,6 +615,33 @@ int vfs_quota_sync(struct super_block *sb, int type) spin_unlock(&dq_list_lock); mutex_unlock(&dqopt->dqonoff_mutex); + if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)) + return 0; + + /* This is not very clever (and fast) but currently I don't know about + * any other simple way of getting quota data to disk and we must get + * them there for userspace to be visible... */ + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + sync_blockdev(sb->s_bdev); + + /* + * Now when everything is written we can discard the pagecache so + * that userspace sees the changes. + */ + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, + I_MUTEX_QUOTA); + truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); + mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex); + } + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return 0; } EXPORT_SYMBOL(vfs_quota_sync); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 3d31228082ea..0593b229656c 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -48,44 +48,6 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd, return security_quotactl(cmd, type, id, sb); } -#ifdef CONFIG_QUOTA -void sync_quota_sb(struct super_block *sb, int type) -{ - int cnt; - - if (!sb->s_qcop || !sb->s_qcop->quota_sync) - return; - - sb->s_qcop->quota_sync(sb, type); - - if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) - return; - /* This is not very clever (and fast) but currently I don't know about - * any other simple way of getting quota data to disk and we must get - * them there for userspace to be visible... */ - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 1); - sync_blockdev(sb->s_bdev); - - /* - * Now when everything is written we can discard the pagecache so - * that userspace sees the changes. - */ - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (type != -1 && cnt != type) - continue; - if (!sb_has_quota_active(sb, cnt)) - continue; - mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, - I_MUTEX_QUOTA); - truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); - mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex); - } - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); -} -#endif - static int quota_sync_all(int type) { struct super_block *sb; @@ -101,6 +63,9 @@ static int quota_sync_all(int type) spin_lock(&sb_lock); restart: list_for_each_entry(sb, &super_blocks, s_list) { + if (!sb->s_qcop || !sb->s_qcop->quota_sync) + continue; + /* This test just improves performance so it needn't be * reliable... */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -119,7 +84,7 @@ restart: spin_unlock(&sb_lock); down_read(&sb->s_umount); if (sb->s_root) - sync_quota_sb(sb, type); + sb->s_qcop->quota_sync(sb, type, 1); up_read(&sb->s_umount); spin_lock(&sb_lock); if (__put_super_and_need_restart(sb)) @@ -306,8 +271,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, case Q_SYNC: if (!sb->s_qcop->quota_sync) return -ENOSYS; - sync_quota_sb(sb, type); - return 0; + return sb->s_qcop->quota_sync(sb, type, 1); case Q_XQUOTAON: case Q_XQUOTAOFF: case Q_XQUOTARM: diff --git a/fs/sync.c b/fs/sync.c index 418727a2a239..f557d71cb097 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -34,14 +34,14 @@ static int __sync_filesystem(struct super_block *sb, int wait) if (!sb->s_bdi) return 0; - /* Avoid doing twice syncing and cache pruning for quota sync */ - if (!wait) { - writeout_quota_sb(sb, -1); - writeback_inodes_sb(sb); - } else { - sync_quota_sb(sb, -1); + if (sb->s_qcop && sb->s_qcop->quota_sync) + sb->s_qcop->quota_sync(sb, -1, wait); + + if (wait) sync_inodes_sb(sb); - } + else + writeback_inodes_sb(sb); + if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); return __sync_blockdev(sb->s_bdev, wait); diff --git a/include/linux/quota.h b/include/linux/quota.h index a6861f117480..570348cbccb1 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -324,7 +324,7 @@ struct dquot_operations { struct quotactl_ops { int (*quota_on)(struct super_block *, int, int, char *, int); int (*quota_off)(struct super_block *, int, int); - int (*quota_sync)(struct super_block *, int); + int (*quota_sync)(struct super_block *, int, int); int (*get_info)(struct super_block *, int, struct if_dqinfo *); int (*set_info)(struct super_block *, int, struct if_dqinfo *); int (*get_dqblk)(struct super_block *, int, qid_t, struct if_dqblk *); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 69d26bc0f884..8cfd0d44c994 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -19,13 +19,6 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb) /* * declaration of quota_function calls in kernel. */ -void sync_quota_sb(struct super_block *sb, int type); -static inline void writeout_quota_sb(struct super_block *sb, int type) -{ - if (sb->s_qcop && sb->s_qcop->quota_sync) - sb->s_qcop->quota_sync(sb, type); -} - void inode_add_rsv_space(struct inode *inode, qsize_t number); void inode_claim_rsv_space(struct inode *inode, qsize_t number); void inode_sub_rsv_space(struct inode *inode, qsize_t number); @@ -67,7 +60,7 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type); int vfs_quota_off(struct super_block *sb, int type, int remount); int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags); -int vfs_quota_sync(struct super_block *sb, int type); +int vfs_quota_sync(struct super_block *sb, int type, int wait); int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii); int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii); int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di); @@ -340,14 +333,6 @@ static inline void vfs_dq_free_inode(struct inode *inode) { } -static inline void sync_quota_sb(struct super_block *sb, int type) -{ -} - -static inline void writeout_quota_sb(struct super_block *sb, int type) -{ -} - static inline int vfs_dq_off(struct super_block *sb, int remount) { return 0; -- cgit v1.2.3 From a56fca23f67282467c08e75c40081da2345dfdbf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Tue, 16 Feb 2010 03:44:53 -0500 Subject: quota: remove invalid optimization from quota_sync_all Checking the "VFS" quota enabled and dirty bits from generic code means this code will never get called for other implementations, e.g. XFS and GFS2. Grabbing the reference on the superblock really isn't much overhead for a global Q_SYNC call, so just drop this optimization. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/quota/quota.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'fs') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 0593b229656c..a43bb2c3ceaf 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -51,7 +51,6 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd, static int quota_sync_all(int type) { struct super_block *sb; - int cnt; int ret; if (type >= MAXQUOTAS) @@ -66,20 +65,6 @@ restart: if (!sb->s_qcop || !sb->s_qcop->quota_sync) continue; - /* This test just improves performance so it needn't be - * reliable... */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (type != -1 && type != cnt) - continue; - if (!sb_has_quota_active(sb, cnt)) - continue; - if (!info_dirty(&sb_dqopt(sb)->info[cnt]) && - list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list)) - continue; - break; - } - if (cnt == MAXQUOTAS) - continue; sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); -- cgit v1.2.3 From 799a9d44023c069f46bc5933a930eab0bd37d0df Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Tue, 16 Feb 2010 03:44:54 -0500 Subject: quota: split out netlink notification support from quota.c Instead of adding ifdefs just split it into a new file. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/quota/Makefile | 1 + fs/quota/netlink.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/quota/quota.c | 93 ---------------------------------------------------- 3 files changed, 96 insertions(+), 93 deletions(-) create mode 100644 fs/quota/netlink.c (limited to 'fs') diff --git a/fs/quota/Makefile b/fs/quota/Makefile index 68d4f6dc0578..dcba20445ccb 100644 --- a/fs/quota/Makefile +++ b/fs/quota/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o obj-$(CONFIG_QUOTA_TREE) += quota_tree.o obj-$(CONFIG_QUOTACTL) += quota.o +obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c new file mode 100644 index 000000000000..2663ed90fb03 --- /dev/null +++ b/fs/quota/netlink.c @@ -0,0 +1,95 @@ + +#include <linux/cred.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/quotaops.h> +#include <linux/sched.h> +#include <net/netlink.h> +#include <net/genetlink.h> + +/* Netlink family structure for quota */ +static struct genl_family quota_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "VFS_DQUOT", + .version = 1, + .maxattr = QUOTA_NL_A_MAX, +}; + +/** + * quota_send_warning - Send warning to userspace about exceeded quota + * @type: The quota type: USRQQUOTA, GRPQUOTA,... + * @id: The user or group id of the quota that was exceeded + * @dev: The device on which the fs is mounted (sb->s_dev) + * @warntype: The type of the warning: QUOTA_NL_... + * + * This can be used by filesystems (including those which don't use + * dquot) to send a message to userspace relating to quota limits. + * + */ + +void quota_send_warning(short type, unsigned int id, dev_t dev, + const char warntype) +{ + static atomic_t seq; + struct sk_buff *skb; + void *msg_head; + int ret; + int msg_size = 4 * nla_total_size(sizeof(u32)) + + 2 * nla_total_size(sizeof(u64)); + + /* We have to allocate using GFP_NOFS as we are called from a + * filesystem performing write and thus further recursion into + * the fs to free some data could cause deadlocks. */ + skb = genlmsg_new(msg_size, GFP_NOFS); + if (!skb) { + printk(KERN_ERR + "VFS: Not enough memory to send quota warning.\n"); + return; + } + msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq), + "a_genl_family, 0, QUOTA_NL_C_WARNING); + if (!msg_head) { + printk(KERN_ERR + "VFS: Cannot store netlink header in quota warning.\n"); + goto err_out; + } + ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type); + if (ret) + goto attr_err_out; + ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev)); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); + if (ret) + goto attr_err_out; + ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid()); + if (ret) + goto attr_err_out; + genlmsg_end(skb, msg_head); + + genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS); + return; +attr_err_out: + printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); +err_out: + kfree_skb(skb); +} +EXPORT_SYMBOL(quota_send_warning); + +static int __init quota_init(void) +{ + if (genl_register_family("a_genl_family) != 0) + printk(KERN_ERR + "VFS: Failed to create quota netlink interface.\n"); + return 0; +}; + +module_init(quota_init); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index a43bb2c3ceaf..4506c6596347 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -19,8 +19,6 @@ #include <linux/quotaops.h> #include <linux/types.h> #include <linux/writeback.h> -#include <net/netlink.h> -#include <net/genetlink.h> static int check_quotactl_permission(struct super_block *sb, int type, int cmd, qid_t id) @@ -458,94 +456,3 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, return ret; } #endif - - -#ifdef CONFIG_QUOTA_NETLINK_INTERFACE - -/* Netlink family structure for quota */ -static struct genl_family quota_genl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = 0, - .name = "VFS_DQUOT", - .version = 1, - .maxattr = QUOTA_NL_A_MAX, -}; - -/** - * quota_send_warning - Send warning to userspace about exceeded quota - * @type: The quota type: USRQQUOTA, GRPQUOTA,... - * @id: The user or group id of the quota that was exceeded - * @dev: The device on which the fs is mounted (sb->s_dev) - * @warntype: The type of the warning: QUOTA_NL_... - * - * This can be used by filesystems (including those which don't use - * dquot) to send a message to userspace relating to quota limits. - * - */ - -void quota_send_warning(short type, unsigned int id, dev_t dev, - const char warntype) -{ - static atomic_t seq; - struct sk_buff *skb; - void *msg_head; - int ret; - int msg_size = 4 * nla_total_size(sizeof(u32)) + - 2 * nla_total_size(sizeof(u64)); - - /* We have to allocate using GFP_NOFS as we are called from a - * filesystem performing write and thus further recursion into - * the fs to free some data could cause deadlocks. */ - skb = genlmsg_new(msg_size, GFP_NOFS); - if (!skb) { - printk(KERN_ERR - "VFS: Not enough memory to send quota warning.\n"); - return; - } - msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq), - "a_genl_family, 0, QUOTA_NL_C_WARNING); - if (!msg_head) { - printk(KERN_ERR - "VFS: Cannot store netlink header in quota warning.\n"); - goto err_out; - } - ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type); - if (ret) - goto attr_err_out; - ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id); - if (ret) - goto attr_err_out; - ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); - if (ret) - goto attr_err_out; - ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev)); - if (ret) - goto attr_err_out; - ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); - if (ret) - goto attr_err_out; - ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid()); - if (ret) - goto attr_err_out; - genlmsg_end(skb, msg_head); - - genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS); - return; -attr_err_out: - printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); -err_out: - kfree_skb(skb); -} -EXPORT_SYMBOL(quota_send_warning); - -static int __init quota_init(void) -{ - if (genl_register_family("a_genl_family) != 0) - printk(KERN_ERR - "VFS: Failed to create quota netlink interface.\n"); - return 0; -}; - -module_init(quota_init); -#endif - -- cgit v1.2.3 From 5582c76f901d240f57329212b59b4d957ea8d6cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Tue, 16 Feb 2010 03:44:55 -0500 Subject: quota: split out compat_sys_quotactl support from quota.c Instead of adding ifdefs just split it into a new file. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/quota/Kconfig | 5 +++ fs/quota/Makefile | 1 + fs/quota/compat.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/quota/quota.c | 117 ----------------------------------------------------- 4 files changed, 124 insertions(+), 117 deletions(-) create mode 100644 fs/quota/compat.c (limited to 'fs') diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index efc02ebb8c70..dad7fb247ddc 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -59,3 +59,8 @@ config QUOTACTL bool depends on XFS_QUOTA || QUOTA default y + +config QUOTACTL_COMPAT + bool + depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT + default y diff --git a/fs/quota/Makefile b/fs/quota/Makefile index dcba20445ccb..5f9e9e276af0 100644 --- a/fs/quota/Makefile +++ b/fs/quota/Makefile @@ -3,4 +3,5 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o obj-$(CONFIG_QUOTA_TREE) += quota_tree.o obj-$(CONFIG_QUOTACTL) += quota.o +obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o diff --git a/fs/quota/compat.c b/fs/quota/compat.c new file mode 100644 index 000000000000..fb1892fe3e56 --- /dev/null +++ b/fs/quota/compat.c @@ -0,0 +1,118 @@ + +#include <linux/syscalls.h> +#include <linux/compat.h> +#include <linux/quotaops.h> + +/* + * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64) + * and is necessary due to alignment problems. + */ +struct compat_if_dqblk { + compat_u64 dqb_bhardlimit; + compat_u64 dqb_bsoftlimit; + compat_u64 dqb_curspace; + compat_u64 dqb_ihardlimit; + compat_u64 dqb_isoftlimit; + compat_u64 dqb_curinodes; + compat_u64 dqb_btime; + compat_u64 dqb_itime; + compat_uint_t dqb_valid; +}; + +/* XFS structures */ +struct compat_fs_qfilestat { + compat_u64 dqb_bhardlimit; + compat_u64 qfs_nblks; + compat_uint_t qfs_nextents; +}; + +struct compat_fs_quota_stat { + __s8 qs_version; + __u16 qs_flags; + __s8 qs_pad; + struct compat_fs_qfilestat qs_uquota; + struct compat_fs_qfilestat qs_gquota; + compat_uint_t qs_incoredqs; + compat_int_t qs_btimelimit; + compat_int_t qs_itimelimit; + compat_int_t qs_rtbtimelimit; + __u16 qs_bwarnlimit; + __u16 qs_iwarnlimit; +}; + +asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, + qid_t id, void __user *addr) +{ + unsigned int cmds; + struct if_dqblk __user *dqblk; + struct compat_if_dqblk __user *compat_dqblk; + struct fs_quota_stat __user *fsqstat; + struct compat_fs_quota_stat __user *compat_fsqstat; + compat_uint_t data; + u16 xdata; + long ret; + + cmds = cmd >> SUBCMDSHIFT; + + switch (cmds) { + case Q_GETQUOTA: + dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); + compat_dqblk = addr; + ret = sys_quotactl(cmd, special, id, dqblk); + if (ret) + break; + if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) || + get_user(data, &dqblk->dqb_valid) || + put_user(data, &compat_dqblk->dqb_valid)) + ret = -EFAULT; + break; + case Q_SETQUOTA: + dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); + compat_dqblk = addr; + ret = -EFAULT; + if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) || + get_user(data, &compat_dqblk->dqb_valid) || + put_user(data, &dqblk->dqb_valid)) + break; + ret = sys_quotactl(cmd, special, id, dqblk); + break; + case Q_XGETQSTAT: + fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat)); + compat_fsqstat = addr; + ret = sys_quotactl(cmd, special, id, fsqstat); + if (ret) + break; + ret = -EFAULT; + /* Copying qs_version, qs_flags, qs_pad */ + if (copy_in_user(compat_fsqstat, fsqstat, + offsetof(struct compat_fs_quota_stat, qs_uquota))) + break; + /* Copying qs_uquota */ + if (copy_in_user(&compat_fsqstat->qs_uquota, + &fsqstat->qs_uquota, + sizeof(compat_fsqstat->qs_uquota)) || + get_user(data, &fsqstat->qs_uquota.qfs_nextents) || + put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents)) + break; + /* Copying qs_gquota */ + if (copy_in_user(&compat_fsqstat->qs_gquota, + &fsqstat->qs_gquota, + sizeof(compat_fsqstat->qs_gquota)) || + get_user(data, &fsqstat->qs_gquota.qfs_nextents) || + put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents)) + break; + /* Copying the rest */ + if (copy_in_user(&compat_fsqstat->qs_incoredqs, + &fsqstat->qs_incoredqs, + sizeof(struct compat_fs_quota_stat) - + offsetof(struct compat_fs_quota_stat, qs_incoredqs)) || + get_user(xdata, &fsqstat->qs_iwarnlimit) || + put_user(xdata, &compat_fsqstat->qs_iwarnlimit)) + break; + ret = 0; + break; + default: + ret = sys_quotactl(cmd, special, id, addr); + } + return ret; +} diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 4506c6596347..95388f9b7356 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -10,7 +10,6 @@ #include <linux/slab.h> #include <asm/current.h> #include <asm/uaccess.h> -#include <linux/compat.h> #include <linux/kernel.h> #include <linux/security.h> #include <linux/syscalls.h> @@ -340,119 +339,3 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, drop_super(sb); return ret; } - -#if defined(CONFIG_COMPAT_FOR_U64_ALIGNMENT) -/* - * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64) - * and is necessary due to alignment problems. - */ -struct compat_if_dqblk { - compat_u64 dqb_bhardlimit; - compat_u64 dqb_bsoftlimit; - compat_u64 dqb_curspace; - compat_u64 dqb_ihardlimit; - compat_u64 dqb_isoftlimit; - compat_u64 dqb_curinodes; - compat_u64 dqb_btime; - compat_u64 dqb_itime; - compat_uint_t dqb_valid; -}; - -/* XFS structures */ -struct compat_fs_qfilestat { - compat_u64 dqb_bhardlimit; - compat_u64 qfs_nblks; - compat_uint_t qfs_nextents; -}; - -struct compat_fs_quota_stat { - __s8 qs_version; - __u16 qs_flags; - __s8 qs_pad; - struct compat_fs_qfilestat qs_uquota; - struct compat_fs_qfilestat qs_gquota; - compat_uint_t qs_incoredqs; - compat_int_t qs_btimelimit; - compat_int_t qs_itimelimit; - compat_int_t qs_rtbtimelimit; - __u16 qs_bwarnlimit; - __u16 qs_iwarnlimit; -}; - -asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, - qid_t id, void __user *addr) -{ - unsigned int cmds; - struct if_dqblk __user *dqblk; - struct compat_if_dqblk __user *compat_dqblk; - struct fs_quota_stat __user *fsqstat; - struct compat_fs_quota_stat __user *compat_fsqstat; - compat_uint_t data; - u16 xdata; - long ret; - - cmds = cmd >> SUBCMDSHIFT; - - switch (cmds) { - case Q_GETQUOTA: - dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); - compat_dqblk = addr; - ret = sys_quotactl(cmd, special, id, dqblk); - if (ret) - break; - if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) || - get_user(data, &dqblk->dqb_valid) || - put_user(data, &compat_dqblk->dqb_valid)) - ret = -EFAULT; - break; - case Q_SETQUOTA: - dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); - compat_dqblk = addr; - ret = -EFAULT; - if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) || - get_user(data, &compat_dqblk->dqb_valid) || - put_user(data, &dqblk->dqb_valid)) - break; - ret = sys_quotactl(cmd, special, id, dqblk); - break; - case Q_XGETQSTAT: - fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat)); - compat_fsqstat = addr; - ret = sys_quotactl(cmd, special, id, fsqstat); - if (ret) - break; - ret = -EFAULT; - /* Copying qs_version, qs_flags, qs_pad */ - if (copy_in_user(compat_fsqstat, fsqstat, - offsetof(struct compat_fs_quota_stat, qs_uquota))) - break; - /* Copying qs_uquota */ - if (copy_in_user(&compat_fsqstat->qs_uquota, - &fsqstat->qs_uquota, - sizeof(compat_fsqstat->qs_uquota)) || - get_user(data, &fsqstat->qs_uquota.qfs_nextents) || - put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents)) - break; - /* Copying qs_gquota */ - if (copy_in_user(&compat_fsqstat->qs_gquota, - &fsqstat->qs_gquota, - sizeof(compat_fsqstat->qs_gquota)) || - get_user(data, &fsqstat->qs_gquota.qfs_nextents) || - put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents)) - break; - /* Copying the rest */ - if (copy_in_user(&compat_fsqstat->qs_incoredqs, - &fsqstat->qs_incoredqs, - sizeof(struct compat_fs_quota_stat) - - offsetof(struct compat_fs_quota_stat, qs_incoredqs)) || - get_user(xdata, &fsqstat->qs_iwarnlimit) || - put_user(xdata, &compat_fsqstat->qs_iwarnlimit)) - break; - ret = 0; - break; - default: - ret = sys_quotactl(cmd, special, id, addr); - } - return ret; -} -#endif -- cgit v1.2.3 From ac0e773718dc20551e72900d2e7eada96ac91100 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Tue, 16 Feb 2010 03:44:56 -0500 Subject: quota: drop permission checks from xfs_fs_set_xstate/xfs_fs_set_xquota We already do these checks in the generic code. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/xfs/linux-2.6/xfs_quotaops.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index 07d67c624922..1947514ce1ad 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -68,8 +68,6 @@ xfs_fs_set_xstate( return -EROFS; if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; if (uflags & XFS_QUOTA_UDQ_ACCT) flags |= XFS_UQUOTA_ACCT; @@ -130,8 +128,6 @@ xfs_fs_set_xquota( return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) return -ESRCH; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); } -- cgit v1.2.3 From e5472147e1c0712d95d973acfdbd862957c77add Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov <dmonakhov@openvz.org> Date: Tue, 16 Feb 2010 19:33:42 +0300 Subject: ext3: quota_write cross block boundary behaviour We always assume what dquot update result in changes in one data block But ext3_quota_write() function may handle cross block boundary writes In fact if this ever happen it will result in incorrect journal credits reservation. And later bug_on triggering. As soon this never happen the boundary cross loop is NOOP. In order to make things straight let's remove this loop and assert cross boundary condition. Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/ext3/super.c | 69 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 34 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 241c520b3081..5c54e5f685d4 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2948,9 +2948,7 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); int err = 0; int offset = off & (sb->s_blocksize - 1); - int tocopy; int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL; - size_t towrite = len; struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -2961,53 +2959,54 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, (unsigned long long)off, (unsigned long long)len); return -EIO; } + + /* + * Since we account only one data block in transaction credits, + * then it is impossible to cross a block boundary. + */ + if (sb->s_blocksize - offset < len) { + ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because not block aligned", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); - while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? - sb->s_blocksize - offset : towrite; - bh = ext3_bread(handle, inode, blk, 1, &err); - if (!bh) + bh = ext3_bread(handle, inode, blk, 1, &err); + if (!bh) + goto out; + if (journal_quota) { + err = ext3_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); goto out; - if (journal_quota) { - err = ext3_journal_get_write_access(handle, bh); - if (err) { - brelse(bh); - goto out; - } - } - lock_buffer(bh); - memcpy(bh->b_data+offset, data, tocopy); - flush_dcache_page(bh->b_page); - unlock_buffer(bh); - if (journal_quota) - err = ext3_journal_dirty_metadata(handle, bh); - else { - /* Always do at least ordered writes for quotas */ - err = ext3_journal_dirty_data(handle, bh); - mark_buffer_dirty(bh); } - brelse(bh); - if (err) - goto out; - offset = 0; - towrite -= tocopy; - data += tocopy; - blk++; } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, len); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + if (journal_quota) + err = ext3_journal_dirty_metadata(handle, bh); + else { + /* Always do at least ordered writes for quotas */ + err = ext3_journal_dirty_data(handle, bh); + mark_buffer_dirty(bh); + } + brelse(bh); out: - if (len == towrite) { + if (err) { mutex_unlock(&inode->i_mutex); return err; } - if (inode->i_size < off+len-towrite) { - i_size_write(inode, off+len-towrite); + if (inode->i_size < off + len) { + i_size_write(inode, off + len); EXT3_I(inode)->i_disksize = inode->i_size; } inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; ext3_mark_inode_dirty(handle, inode); mutex_unlock(&inode->i_mutex); - return len - towrite; + return len; } #endif -- cgit v1.2.3 From 86963918965eb8fe0c8ae009e7c1b4c630f533d5 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Tue, 16 Feb 2010 20:37:12 +0100 Subject: jbd: Delay discarding buffers in journal_unmap_buffer Delay discarding buffers in journal_unmap_buffer until we know that "add to orphan" operation has definitely been committed, otherwise the log space of committing transation may be freed and reused before truncate get committed, updates may get lost if crash happens. This patch is a backport of JBD2 fix by dingdinghua <dingdinghua@nrchpc.ac.cn>. Signed-off-by: Jan Kara <jack@suse.cz> --- fs/jbd/commit.c | 10 +++++----- fs/jbd/transaction.c | 43 +++++++++++++++++++++++++++++++------------ 2 files changed, 36 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 4bd882548c45..2c90e3ef625f 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -862,12 +862,12 @@ restart_loop: /* A buffer which has been freed while still being * journaled by a previous transaction may end up still * being dirty here, but we want to avoid writing back - * that buffer in the future now that the last use has - * been committed. That's not only a performance gain, - * it also stops aliasing problems if the buffer is left - * behind for writeback and gets reallocated for another + * that buffer in the future after the "add to orphan" + * operation been committed, That's not only a performance + * gain, it also stops aliasing problems if the buffer is + * left behind for writeback and gets reallocated for another * use in a different page. */ - if (buffer_freed(bh)) { + if (buffer_freed(bh) && !jh->b_next_transaction) { clear_buffer_freed(bh); clear_buffer_jbddirty(bh); } diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 006f9ad838a2..99e9fea11077 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1864,6 +1864,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) if (!jh) goto zap_buffer_no_jh; + /* + * We cannot remove the buffer from checkpoint lists until the + * transaction adding inode to orphan list (let's call it T) + * is committed. Otherwise if the transaction changing the + * buffer would be cleaned from the journal before T is + * committed, a crash will cause that the correct contents of + * the buffer will be lost. On the other hand we have to + * clear the buffer dirty bit at latest at the moment when the + * transaction marking the buffer as freed in the filesystem + * structures is committed because from that moment on the + * buffer can be reallocated and used by a different page. + * Since the block hasn't been freed yet but the inode has + * already been added to orphan list, it is safe for us to add + * the buffer to BJ_Forget list of the newest transaction. + */ transaction = jh->b_transaction; if (transaction == NULL) { /* First case: not on any transaction. If it @@ -1929,16 +1944,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) goto zap_buffer; } /* - * If it is committing, we simply cannot touch it. We - * can remove it's next_transaction pointer from the - * running transaction if that is set, but nothing - * else. */ + * The buffer is committing, we simply cannot touch + * it. So we just set j_next_transaction to the + * running transaction (if there is one) and mark + * buffer as freed so that commit code knows it should + * clear dirty bits when it is done with the buffer. + */ set_buffer_freed(bh); - if (jh->b_next_transaction) { - J_ASSERT(jh->b_next_transaction == - journal->j_running_transaction); - jh->b_next_transaction = NULL; - } + if (journal->j_running_transaction && buffer_jbddirty(bh)) + jh->b_next_transaction = journal->j_running_transaction; journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -2120,7 +2134,7 @@ void journal_file_buffer(struct journal_head *jh, */ void __journal_refile_buffer(struct journal_head *jh) { - int was_dirty; + int was_dirty, jlist; struct buffer_head *bh = jh2bh(jh); J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); @@ -2142,8 +2156,13 @@ void __journal_refile_buffer(struct journal_head *jh) __journal_temp_unlink_buffer(jh); jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; - __journal_file_buffer(jh, jh->b_transaction, - jh->b_modified ? BJ_Metadata : BJ_Reserved); + if (buffer_freed(bh)) + jlist = BJ_Forget; + else if (jh->b_modified) + jlist = BJ_Metadata; + else + jlist = BJ_Reserved; + __journal_file_buffer(jh, jh->b_transaction, jlist); J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); if (was_dirty) -- cgit v1.2.3 From ad1e6e8da9fe8cb7ecfde8eabacedc3b50fceae4 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov <dmonakhov@openvz.org> Date: Tue, 16 Feb 2010 08:31:49 +0300 Subject: quota: sb_quota state flags cleanup - remove hardcoded USRQUOTA/GRPQUOTA flags - convert int to bool for appropriate functions Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/quota/dquot.c | 3 ++- include/linux/quota.h | 15 +++++++-------- include/linux/quotaops.h | 31 +++++++++++++++++-------------- 3 files changed, 26 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 4c2213f7ed36..5a831dc5ab28 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1301,7 +1301,7 @@ int dquot_initialize(struct inode *inode, int type) { unsigned int id = 0; int cnt, ret = 0; - struct dquot *got[MAXQUOTAS] = { NULL, NULL }; + struct dquot *got[MAXQUOTAS]; struct super_block *sb = inode->i_sb; qsize_t rsv; @@ -1312,6 +1312,7 @@ int dquot_initialize(struct inode *inode, int type) /* First get references to structures we might need. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + got[cnt] = NULL; if (type != -1 && cnt != type) continue; switch (cnt) { diff --git a/include/linux/quota.h b/include/linux/quota.h index 570348cbccb1..92547a57e25a 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -357,26 +357,25 @@ enum { #define DQUOT_STATE_FLAGS (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED | \ DQUOT_SUSPENDED) /* Other quota flags */ -#define DQUOT_QUOTA_SYS_FILE (1 << 6) /* Quota file is a special +#define DQUOT_STATE_LAST (_DQUOT_STATE_FLAGS * MAXQUOTAS) +#define DQUOT_QUOTA_SYS_FILE (1 << DQUOT_STATE_LAST) + /* Quota file is a special * system file and user cannot * touch it. Filesystem is * responsible for setting * S_NOQUOTA, S_NOATIME flags */ -#define DQUOT_NEGATIVE_USAGE (1 << 7) /* Allow negative quota usage */ +#define DQUOT_NEGATIVE_USAGE (1 << (DQUOT_STATE_LAST + 1)) + /* Allow negative quota usage */ static inline unsigned int dquot_state_flag(unsigned int flags, int type) { - if (type == USRQUOTA) - return flags; - return flags << _DQUOT_STATE_FLAGS; + return flags << _DQUOT_STATE_FLAGS * type; } static inline unsigned int dquot_generic_flag(unsigned int flags, int type) { - if (type == USRQUOTA) - return flags; - return flags >> _DQUOT_STATE_FLAGS; + return (flags >> _DQUOT_STATE_FLAGS * type) & DQUOT_STATE_FLAGS; } #ifdef CONFIG_QUOTA_NETLINK_INTERFACE diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 8cfd0d44c994..e563a20cff4f 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -79,53 +79,56 @@ static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type) * Functions for checking status of quota */ -static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type) +static inline bool sb_has_quota_usage_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_USAGE_ENABLED, type); } -static inline int sb_has_quota_limits_enabled(struct super_block *sb, int type) +static inline bool sb_has_quota_limits_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_LIMITS_ENABLED, type); } -static inline int sb_has_quota_suspended(struct super_block *sb, int type) +static inline bool sb_has_quota_suspended(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_SUSPENDED, type); } -static inline int sb_any_quota_suspended(struct super_block *sb) +static inline unsigned sb_any_quota_suspended(struct super_block *sb) { - return sb_has_quota_suspended(sb, USRQUOTA) || - sb_has_quota_suspended(sb, GRPQUOTA); + unsigned type, tmsk = 0; + for (type = 0; type < MAXQUOTAS; type++) + tmsk |= sb_has_quota_suspended(sb, type) << type; + return tmsk; } /* Does kernel know about any quota information for given sb + type? */ -static inline int sb_has_quota_loaded(struct super_block *sb, int type) +static inline bool sb_has_quota_loaded(struct super_block *sb, int type) { /* Currently if anything is on, then quota usage is on as well */ return sb_has_quota_usage_enabled(sb, type); } -static inline int sb_any_quota_loaded(struct super_block *sb) +static inline unsigned sb_any_quota_loaded(struct super_block *sb) { - return sb_has_quota_loaded(sb, USRQUOTA) || - sb_has_quota_loaded(sb, GRPQUOTA); + unsigned type, tmsk = 0; + for (type = 0; type < MAXQUOTAS; type++) + tmsk |= sb_has_quota_loaded(sb, type) << type; + return tmsk; } -static inline int sb_has_quota_active(struct super_block *sb, int type) +static inline bool sb_has_quota_active(struct super_block *sb, int type) { return sb_has_quota_loaded(sb, type) && !sb_has_quota_suspended(sb, type); } -static inline int sb_any_quota_active(struct super_block *sb) +static inline unsigned sb_any_quota_active(struct super_block *sb) { - return sb_has_quota_active(sb, USRQUOTA) || - sb_has_quota_active(sb, GRPQUOTA); + return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb); } /* -- cgit v1.2.3 From 8ddd69d6df4758bf0cab981481af24cc84419567 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov <dmonakhov@openvz.org> Date: Tue, 16 Feb 2010 08:31:50 +0300 Subject: quota: generalize quota transfer interface Current quota transfer interface support only uid/gid. This patch extend interface in order to support various quotas types The goal is accomplished without changes in most frequently used vfs_dq_transfer() func. Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/quota/dquot.c | 32 ++++++++++++++++++++------------ include/linux/quota.h | 2 +- include/linux/quotaops.h | 2 +- 3 files changed, 22 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 5a831dc5ab28..4d2041fddefc 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1695,15 +1695,13 @@ EXPORT_SYMBOL(dquot_free_inode); * This operation can block, but only after everything is updated * A transaction must be started when entering this function. */ -int dquot_transfer(struct inode *inode, struct iattr *iattr) +int dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask) { qsize_t space, cur_space; qsize_t rsv_space = 0; struct dquot *transfer_from[MAXQUOTAS]; struct dquot *transfer_to[MAXQUOTAS]; int cnt, ret = QUOTA_OK; - int chuid = iattr->ia_valid & ATTR_UID && inode->i_uid != iattr->ia_uid, - chgid = iattr->ia_valid & ATTR_GID && inode->i_gid != iattr->ia_gid; char warntype_to[MAXQUOTAS]; char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; @@ -1717,13 +1715,10 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) transfer_to[cnt] = NULL; warntype_to[cnt] = QUOTA_NL_NOWARN; } - if (chuid) - transfer_to[USRQUOTA] = dqget(inode->i_sb, iattr->ia_uid, - USRQUOTA); - if (chgid) - transfer_to[GRPQUOTA] = dqget(inode->i_sb, iattr->ia_gid, - GRPQUOTA); - + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (mask & (1 << cnt)) + transfer_to[cnt] = dqget(inode->i_sb, chid[cnt], cnt); + } down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1799,12 +1794,25 @@ over_quota: } EXPORT_SYMBOL(dquot_transfer); -/* Wrapper for transferring ownership of an inode */ +/* Wrapper for transferring ownership of an inode for uid/gid only + * Called from FSXXX_setattr() + */ int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) { + qid_t chid[MAXQUOTAS]; + unsigned long mask = 0; + + if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) { + mask |= 1 << USRQUOTA; + chid[USRQUOTA] = iattr->ia_uid; + } + if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) { + mask |= 1 << GRPQUOTA; + chid[GRPQUOTA] = iattr->ia_gid; + } if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { vfs_dq_init(inode); - if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) + if (inode->i_sb->dq_op->transfer(inode, chid, mask) == NO_QUOTA) return 1; } return 0; diff --git a/include/linux/quota.h b/include/linux/quota.h index 92547a57e25a..edf34f2fe87d 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -301,7 +301,7 @@ struct dquot_operations { int (*alloc_inode) (const struct inode *, qsize_t); int (*free_space) (struct inode *, qsize_t); int (*free_inode) (const struct inode *, qsize_t); - int (*transfer) (struct inode *, struct iattr *); + int (*transfer) (struct inode *, qid_t *, unsigned long); int (*write_dquot) (struct dquot *); /* Ordinary dquot write */ struct dquot *(*alloc_dquot)(struct super_block *, int); /* Allocate memory for new dquot */ void (*destroy_dquot)(struct dquot *); /* Free memory for dquot */ diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index e563a20cff4f..e1cae204b5d9 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -43,7 +43,7 @@ void dquot_release_reserved_space(struct inode *inode, qsize_t number); int dquot_free_space(struct inode *inode, qsize_t number); int dquot_free_inode(const struct inode *inode, qsize_t number); -int dquot_transfer(struct inode *inode, struct iattr *iattr); +int dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask); int dquot_commit(struct dquot *dquot); int dquot_acquire(struct dquot *dquot); int dquot_release(struct dquot *dquot); -- cgit v1.2.3 From ab94c39b6fa076d4f6d2903dcc54cda35d938776 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Mon, 22 Feb 2010 21:07:17 +0100 Subject: quota: Properly invalidate caches even for filesystems with blocksize < pagesize Sometimes invalidate_bdev() can fail to invalidate a part of block device cache because of dirty data. If the filesystem has blocksize smaller than page size, this can happen even for pages containing quota files and thus kernel would operate on stale data. Fix the issue by syncing the filesystem before invalidating the cache. Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/quota/dquot.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 4d2041fddefc..10d021dd37c1 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2033,11 +2033,13 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, } if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { - /* As we bypass the pagecache we must now flush the inode so - * that we see all the changes from userspace... */ - write_inode_now(inode, 1); - /* And now flush the block cache so that kernel sees the - * changes */ + /* As we bypass the pagecache we must now flush all the + * dirty data and invalidate caches so that kernel sees + * changes from userspace. It is not enough to just flush + * the quota file since if blocksize < pagesize, invalidation + * of the cache could fail because of other unrelated dirty + * data */ + sync_filesystem(sb); invalidate_bdev(sb->s_bdev); } mutex_lock(&dqopt->dqonoff_mutex); -- cgit v1.2.3 From 7eb4969e04060dcf3fbd46af9c21b1059b853068 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Mon, 1 Mar 2010 14:02:37 +0100 Subject: ext3: Truncate allocated blocks if direct IO write fails to update i_size We have to truncate blocks allocated to file during direct IO when we fail to update i_size properly. Signed-off-by: Jan Kara <jack@suse.cz> --- fs/ext3/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 44b53386ab8b..c0ff9d6ffde6 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1785,8 +1785,9 @@ retry: handle = ext3_journal_start(inode, 2); if (IS_ERR(handle)) { /* This is really bad luck. We've written the data - * but cannot extend i_size. Bail out and pretend - * the write failed... */ + * but cannot extend i_size. Truncate allocated blocks + * and pretend the write failed... */ + ext3_truncate(inode); ret = PTR_ERR(handle); goto out; } -- cgit v1.2.3 From 49792c806d0bfd53afc789dcdf50dc9bed2c5b83 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov <dmonakhov@openvz.org> Date: Tue, 2 Mar 2010 15:51:02 +0300 Subject: ext3: add writepage sanity checks - There is theoretical possibility to perform writepage on RO superblock. Add explicit check for what case. - Page must being locked before writepage. Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/ext3/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index c0ff9d6ffde6..eda9121d7d57 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1528,6 +1528,7 @@ static int ext3_ordered_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); + WARN_ON_ONCE(IS_RDONLY(inode)); /* * We give up here if we're reentered, because it might be for a @@ -1600,6 +1601,9 @@ static int ext3_writeback_writepage(struct page *page, int ret = 0; int err; + J_ASSERT(PageLocked(page)); + WARN_ON_ONCE(IS_RDONLY(inode)); + if (ext3_journal_current_handle()) goto out_fail; @@ -1642,6 +1646,9 @@ static int ext3_journalled_writepage(struct page *page, int ret = 0; int err; + J_ASSERT(PageLocked(page)); + WARN_ON_ONCE(IS_RDONLY(inode)); + if (ext3_journal_current_handle()) goto no_write; -- cgit v1.2.3 From 5dd4056db84387975140ff2568eaa0406f07985e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 3 Mar 2010 09:05:00 -0500 Subject: dquot: cleanup space allocation / freeing routines Get rid of the alloc_space, free_space, reserve_space, claim_space and release_rsv dquot operations - they are always called from the filesystem and if a filesystem really needs their own (which none currently does) it can just call into it's own routine directly. Move shared logic into the common __dquot_alloc_space, dquot_claim_space_nodirty and __dquot_free_space low-level methods, and rationalize the wrappers around it to move as much as possible code into the common block for CONFIG_QUOTA vs not. Also rename all these helpers to be named dquot_* instead of vfs_dq_*. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- Documentation/filesystems/Locking | 6 +- fs/ext2/balloc.c | 12 ++- fs/ext2/xattr.c | 10 +- fs/ext3/balloc.c | 11 +- fs/ext3/inode.c | 2 +- fs/ext3/super.c | 2 - fs/ext3/xattr.c | 8 +- fs/ext4/inode.c | 20 ++-- fs/ext4/mballoc.c | 6 +- fs/ext4/super.c | 5 - fs/ext4/xattr.c | 8 +- fs/jfs/jfs_dtree.c | 28 ++--- fs/jfs/jfs_extent.c | 16 +-- fs/jfs/jfs_xtree.c | 21 ++-- fs/jfs/xattr.c | 17 ++-- fs/ocfs2/alloc.c | 13 ++- fs/ocfs2/aops.c | 11 +- fs/ocfs2/dir.c | 37 +++---- fs/ocfs2/file.c | 11 +- fs/ocfs2/namei.c | 9 +- fs/ocfs2/quota_global.c | 2 - fs/quota/dquot.c | 79 +++++---------- fs/reiserfs/bitmap.c | 10 +- fs/reiserfs/stree.c | 20 ++-- fs/reiserfs/super.c | 2 - fs/udf/balloc.c | 35 ++++--- fs/ufs/balloc.c | 24 +++-- include/linux/quota.h | 8 -- include/linux/quotaops.h | 208 +++++++++++--------------------------- 29 files changed, 258 insertions(+), 383 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 18b9d0ca0630..1192fde11638 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -462,9 +462,7 @@ in sys_read() and friends. prototypes: int (*initialize) (struct inode *, int); int (*drop) (struct inode *); - int (*alloc_space) (struct inode *, qsize_t, int); int (*alloc_inode) (const struct inode *, unsigned long); - int (*free_space) (struct inode *, qsize_t); int (*free_inode) (const struct inode *, unsigned long); int (*transfer) (struct inode *, struct iattr *); int (*write_dquot) (struct dquot *); @@ -481,9 +479,7 @@ What filesystem should expect from the generic quota functions: FS recursion Held locks when called initialize: yes maybe dqonoff_sem drop: yes - -alloc_space: ->mark_dirty() - alloc_inode: ->mark_dirty() - -free_space: ->mark_dirty() - free_inode: ->mark_dirty() - transfer: yes - write_dquot: yes dqonoff_sem or dqptr_sem @@ -495,7 +491,7 @@ write_info: yes dqonoff_sem FS recursion means calling ->quota_read() and ->quota_write() from superblock operations. -->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called +->alloc_inode(), ->free_inode() are called only directly by the filesystem and do not call any fs functions only the ->mark_dirty() operation. diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 7f8d2e5a7ea6..1d081f0cfec2 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -570,7 +570,7 @@ do_more: error_return: brelse(bitmap_bh); release_blocks(sb, freed); - vfs_dq_free_block(inode, freed); + dquot_free_block(inode, freed); } /** @@ -1236,6 +1236,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, unsigned short windowsz = 0; unsigned long ngroups; unsigned long num = *count; + int ret; *errp = -ENOSPC; sb = inode->i_sb; @@ -1247,8 +1248,9 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, /* * Check quota for allocation of this block. */ - if (vfs_dq_alloc_block(inode, num)) { - *errp = -EDQUOT; + ret = dquot_alloc_block(inode, num); + if (ret) { + *errp = ret; return 0; } @@ -1409,7 +1411,7 @@ allocated: *errp = 0; brelse(bitmap_bh); - vfs_dq_free_block(inode, *count-num); + dquot_free_block(inode, *count-num); *count = num; return ret_block; @@ -1420,7 +1422,7 @@ out: * Undo the block allocation */ if (!performed_allocation) - vfs_dq_free_block(inode, *count); + dquot_free_block(inode, *count); brelse(bitmap_bh); return 0; } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 904f00642f84..e44dc92609be 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -644,8 +644,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, the inode. */ ea_bdebug(new_bh, "reusing block"); - error = -EDQUOT; - if (vfs_dq_alloc_block(inode, 1)) { + error = dquot_alloc_block(inode, 1); + if (error) { unlock_buffer(new_bh); goto cleanup; } @@ -702,7 +702,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, * as if nothing happened and cleanup the unused block */ if (error && error != -ENOSPC) { if (new_bh && new_bh != old_bh) - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto cleanup; } } else @@ -734,7 +734,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, le32_add_cpu(&HDR(old_bh)->h_refcount, -1); if (ce) mb_cache_entry_release(ce); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); mark_buffer_dirty(old_bh); ea_bdebug(old_bh, "refcount now=%d", le32_to_cpu(HDR(old_bh)->h_refcount)); @@ -797,7 +797,7 @@ ext2_xattr_delete_inode(struct inode *inode) mark_buffer_dirty(bh); if (IS_SYNC(inode)) sync_dirty_buffer(bh); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); } EXT2_I(inode)->i_file_acl = 0; diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 27967f92e820..161da2d3f890 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -676,7 +676,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode, } ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); if (dquot_freed_blocks) - vfs_dq_free_block(inode, dquot_freed_blocks); + dquot_free_block(inode, dquot_freed_blocks); return; } @@ -1502,8 +1502,9 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, /* * Check quota for allocation of this block. */ - if (vfs_dq_alloc_block(inode, num)) { - *errp = -EDQUOT; + err = dquot_alloc_block(inode, num); + if (err) { + *errp = err; return 0; } @@ -1713,7 +1714,7 @@ allocated: *errp = 0; brelse(bitmap_bh); - vfs_dq_free_block(inode, *count-num); + dquot_free_block(inode, *count-num); *count = num; return ret_block; @@ -1728,7 +1729,7 @@ out: * Undo the block allocation */ if (!performed_allocation) - vfs_dq_free_block(inode, *count); + dquot_free_block(inode, *count); brelse(bitmap_bh); return 0; } diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index eda9121d7d57..20f02d69365c 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -3336,7 +3336,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * - * Also, vfs_dq_alloc_space() will always dirty the inode when blocks + * Also, dquot_alloc_space() will always dirty the inode when blocks * are allocated to the file. * * If the inode is marked synchronous, we don't honour that here - doing diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 5c54e5f685d4..8c13910a3782 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -752,9 +752,7 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, static const struct dquot_operations ext3_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .alloc_space = dquot_alloc_space, .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = ext3_write_dquot, diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 2d2fb2a85961..534a94c3a933 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -500,7 +500,7 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode, error = ext3_journal_dirty_metadata(handle, bh); if (IS_SYNC(inode)) handle->h_sync = 1; - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); if (ce) @@ -775,8 +775,8 @@ inserted: else { /* The old block is released after updating the inode. */ - error = -EDQUOT; - if (vfs_dq_alloc_block(inode, 1)) + error = dquot_alloc_block(inode, 1); + if (error) goto cleanup; error = ext3_journal_get_write_access(handle, new_bh); @@ -850,7 +850,7 @@ cleanup: return error; cleanup_dquot: - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto cleanup; bad_block: diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e11952404e02..9f607ea411c8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1093,9 +1093,9 @@ void ext4_da_update_reserve_space(struct inode *inode, /* Update quota subsystem */ if (quota_claim) { - vfs_dq_claim_block(inode, used); + dquot_claim_block(inode, used); if (mdb_free) - vfs_dq_release_reservation_block(inode, mdb_free); + dquot_release_reservation_block(inode, mdb_free); } else { /* * We did fallocate with an offset that is already delayed @@ -1106,8 +1106,8 @@ void ext4_da_update_reserve_space(struct inode *inode, * that */ if (allocated_meta_blocks) - vfs_dq_claim_block(inode, allocated_meta_blocks); - vfs_dq_release_reservation_block(inode, mdb_free + used); + dquot_claim_block(inode, allocated_meta_blocks); + dquot_release_reservation_block(inode, mdb_free + used); } /* @@ -1836,6 +1836,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); unsigned long md_needed, md_reserved; + int ret; /* * recalculate the amount of metadata blocks to reserve @@ -1853,11 +1854,12 @@ repeat: * later. Real quota accounting is done at pages writeout * time. */ - if (vfs_dq_reserve_block(inode, md_needed + 1)) - return -EDQUOT; + ret = dquot_reserve_block(inode, md_needed + 1); + if (ret) + return ret; if (ext4_claim_free_blocks(sbi, md_needed + 1)) { - vfs_dq_release_reservation_block(inode, md_needed + 1); + dquot_release_reservation_block(inode, md_needed + 1); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); goto repeat; @@ -1914,7 +1916,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free) spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - vfs_dq_release_reservation_block(inode, to_free); + dquot_release_reservation_block(inode, to_free); } static void ext4_da_page_release_reservation(struct page *page, @@ -5641,7 +5643,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * - * Also, vfs_dq_alloc_block() will always dirty the inode when blocks + * Also, dquot_alloc_block() will always dirty the inode when blocks * are allocated to the file. * * If the inode is marked synchronous, we don't honour that here - doing diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d34afad3e137..0b905781e8e6 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4254,7 +4254,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, return 0; } reserv_blks = ar->len; - while (ar->len && vfs_dq_alloc_block(ar->inode, ar->len)) { + while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; } @@ -4331,7 +4331,7 @@ out2: kmem_cache_free(ext4_ac_cachep, ac); out1: if (inquota && ar->len < inquota) - vfs_dq_free_block(ar->inode, inquota - ar->len); + dquot_free_block(ar->inode, inquota - ar->len); out3: if (!ar->len) { if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) @@ -4646,7 +4646,7 @@ do_more: sb->s_dirt = 1; error_return: if (freed) - vfs_dq_free_block(inode, freed); + dquot_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); if (ac) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 735c20d5fd56..fa8f4deda652 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1014,15 +1014,10 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, static const struct dquot_operations ext4_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .alloc_space = dquot_alloc_space, - .reserve_space = dquot_reserve_space, - .claim_space = dquot_claim_space, - .release_rsv = dquot_release_reserved_space, #ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, #endif .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = ext4_write_dquot, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index f3a2f7ed45aa..ab3a95ee5e7e 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -494,7 +494,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); if (ce) @@ -787,8 +787,8 @@ inserted: else { /* The old block is released after updating the inode. */ - error = -EDQUOT; - if (vfs_dq_alloc_block(inode, 1)) + error = dquot_alloc_block(inode, 1); + if (error) goto cleanup; error = ext4_journal_get_write_access(handle, new_bh); @@ -876,7 +876,7 @@ cleanup: return error; cleanup_dquot: - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto cleanup; bad_block: diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 925871e9887b..0e4623be70ce 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -381,10 +381,10 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) * It's time to move the inline table to an external * page and begin to build the xtree */ - if (vfs_dq_alloc_block(ip, sbi->nbperpage)) + if (dquot_alloc_block(ip, sbi->nbperpage)) goto clean_up; if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { - vfs_dq_free_block(ip, sbi->nbperpage); + dquot_free_block(ip, sbi->nbperpage); goto clean_up; } @@ -408,7 +408,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) memcpy(&jfs_ip->i_dirtable, temp_table, sizeof (temp_table)); dbFree(ip, xaddr, sbi->nbperpage); - vfs_dq_free_block(ip, sbi->nbperpage); + dquot_free_block(ip, sbi->nbperpage); goto clean_up; } ip->i_size = PSIZE; @@ -1027,10 +1027,9 @@ static int dtSplitUp(tid_t tid, n = xlen; /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, n)) { - rc = -EDQUOT; + rc = dquot_alloc_block(ip, n); + if (rc) goto extendOut; - } quota_allocation += n; if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, @@ -1308,7 +1307,7 @@ static int dtSplitUp(tid_t tid, /* Rollback quota allocation */ if (rc && quota_allocation) - vfs_dq_free_block(ip, quota_allocation); + dquot_free_block(ip, quota_allocation); dtSplitUp_Exit: @@ -1369,9 +1368,10 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, return -EIO; /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { release_metapage(rmp); - return -EDQUOT; + return rc; } jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); @@ -1892,6 +1892,7 @@ static int dtSplitRoot(tid_t tid, struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; + int rc; /* get split root page */ smp = split->mp; @@ -1916,9 +1917,10 @@ static int dtSplitRoot(tid_t tid, rp = rmp->data; /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { release_metapage(rmp); - return -EDQUOT; + return rc; } BT_MARK_DIRTY(rmp, ip); @@ -2287,7 +2289,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, xlen = lengthPXD(&fp->header.self); /* Free quota allocation. */ - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(fmp); @@ -2363,7 +2365,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, xlen = lengthPXD(&p->header.self); /* Free quota allocation */ - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(mp); diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index 41d6045dbeb0..5d3bbd10f8db 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -141,10 +141,11 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) } /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, nxlen)) { + rc = dquot_alloc_block(ip, nxlen); + if (rc) { dbFree(ip, nxaddr, (s64) nxlen); mutex_unlock(&JFS_IP(ip)->commit_mutex); - return -EDQUOT; + return rc; } /* determine the value of the extent flag */ @@ -164,7 +165,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) */ if (rc) { dbFree(ip, nxaddr, nxlen); - vfs_dq_free_block(ip, nxlen); + dquot_free_block(ip, nxlen); mutex_unlock(&JFS_IP(ip)->commit_mutex); return (rc); } @@ -256,10 +257,11 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) goto exit; /* Allocat blocks to quota. */ - if (vfs_dq_alloc_block(ip, nxlen)) { + rc = dquot_alloc_block(ip, nxlen); + if (rc) { dbFree(ip, nxaddr, (s64) nxlen); mutex_unlock(&JFS_IP(ip)->commit_mutex); - return -EDQUOT; + return rc; } delta = nxlen - xlen; @@ -297,7 +299,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) /* extend the extent */ if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) { dbFree(ip, xaddr + xlen, delta); - vfs_dq_free_block(ip, nxlen); + dquot_free_block(ip, nxlen); goto exit; } } else { @@ -308,7 +310,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) */ if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) { dbFree(ip, nxaddr, nxlen); - vfs_dq_free_block(ip, nxlen); + dquot_free_block(ip, nxlen); goto exit; } } diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index d654a6458648..6c50871e6220 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -585,10 +585,10 @@ int xtInsert(tid_t tid, /* transaction id */ hint = addressXAD(xad) + lengthXAD(xad) - 1; } else hint = 0; - if ((rc = vfs_dq_alloc_block(ip, xlen))) + if ((rc = dquot_alloc_block(ip, xlen))) goto out; if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) { - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); goto out; } } @@ -617,7 +617,7 @@ int xtInsert(tid_t tid, /* transaction id */ /* undo data extent allocation */ if (*xaddrp == 0) { dbFree(ip, xaddr, (s64) xlen); - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); } return rc; } @@ -985,10 +985,9 @@ xtSplitPage(tid_t tid, struct inode *ip, rbn = addressPXD(pxd); /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { - rc = -EDQUOT; + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) goto clean_up; - } quota_allocation += lengthPXD(pxd); @@ -1195,7 +1194,7 @@ xtSplitPage(tid_t tid, struct inode *ip, /* Rollback quota allocation. */ if (quota_allocation) - vfs_dq_free_block(ip, quota_allocation); + dquot_free_block(ip, quota_allocation); return (rc); } @@ -1235,6 +1234,7 @@ xtSplitRoot(tid_t tid, struct pxdlist *pxdlist; struct tlock *tlck; struct xtlock *xtlck; + int rc; sp = &JFS_IP(ip)->i_xtroot; @@ -1252,9 +1252,10 @@ xtSplitRoot(tid_t tid, return -EIO; /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { release_metapage(rmp); - return -EDQUOT; + return rc; } jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp); @@ -3680,7 +3681,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) ip->i_size = newsize; /* update quota allocation to reflect freed blocks */ - vfs_dq_free_block(ip, nfreed); + dquot_free_block(ip, nfreed); /* * free tlock of invalidated pages diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index fad364548bc9..1f594ab21895 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -260,14 +260,14 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size, nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits; /* Allocate new blocks to quota. */ - if (vfs_dq_alloc_block(ip, nblocks)) { - return -EDQUOT; - } + rc = dquot_alloc_block(ip, nblocks); + if (rc) + return rc; rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno); if (rc) { /*Rollback quota allocation. */ - vfs_dq_free_block(ip, nblocks); + dquot_free_block(ip, nblocks); return rc; } @@ -332,7 +332,7 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size, failed: /* Rollback quota allocation. */ - vfs_dq_free_block(ip, nblocks); + dquot_free_block(ip, nblocks); dbFree(ip, blkno, nblocks); return rc; @@ -538,7 +538,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) if (blocks_needed > current_blocks) { /* Allocate new blocks to quota. */ - if (vfs_dq_alloc_block(inode, blocks_needed)) + rc = dquot_alloc_block(inode, blocks_needed); + if (rc) return -EDQUOT; quota_allocation = blocks_needed; @@ -602,7 +603,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) clean_up: /* Rollback quota allocation */ if (quota_allocation) - vfs_dq_free_block(inode, quota_allocation); + dquot_free_block(inode, quota_allocation); return (rc); } @@ -677,7 +678,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf, /* If old blocks exist, they must be removed from quota allocation. */ if (old_blocks) - vfs_dq_free_block(inode, old_blocks); + dquot_free_block(inode, old_blocks); inode->i_ctime = CURRENT_TIME; diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index d17bdc718f74..20538dd832a4 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5712,7 +5712,7 @@ int ocfs2_remove_btree_range(struct inode *inode, goto out; } - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(inode->i_sb, len)); ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc); @@ -6935,7 +6935,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, goto bail; } - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_del)); spin_lock(&OCFS2_I(inode)->ip_lock); OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - @@ -7300,11 +7300,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, unsigned int page_end; u64 phys; - if (vfs_dq_alloc_space_nodirty(inode, - ocfs2_clusters_to_bytes(osb->sb, 1))) { - ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (ret) goto out_commit; - } did_quota = 1; ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, @@ -7380,7 +7379,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 1)); ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 7e9df11260f4..7d04c171567d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1763,10 +1763,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, wc->w_handle = handle; - if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode, - ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) { - ret = -EDQUOT; - goto out_commit; + if (clusters_to_alloc) { + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); + if (ret) + goto out_commit; } /* * We don't want this to fail in ocfs2_write_end(), so do it @@ -1809,7 +1810,7 @@ success: return 0; out_quota: if (clusters_to_alloc) - vfs_dq_free_space(inode, + dquot_free_space(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); out_commit: ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 28c3ec238796..a63ea4d74e67 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2964,12 +2964,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, goto out; } - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(osb->sb, - alloc + dx_alloc))) { - ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc)); + if (ret) goto out_commit; - } did_quota = 1; if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { @@ -3178,7 +3176,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(dir, bytes_allocated); + dquot_free_space_nodirty(dir, bytes_allocated); ocfs2_commit_trans(osb, handle); @@ -3221,11 +3219,10 @@ static int ocfs2_do_extend_dir(struct super_block *sb, if (extend) { u32 offset = OCFS2_I(dir)->ip_clusters; - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(sb, 1))) { - status = -EDQUOT; + status = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(sb, 1)); + if (status) goto bail; - } did_quota = 1; status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, @@ -3254,7 +3251,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb, status = 0; bail: if (did_quota && status < 0) - vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); + dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); mlog_exit(status); return status; } @@ -3889,11 +3886,10 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, goto out; } - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(dir->i_sb, 1))) { - ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1)); + if (ret) goto out_commit; - } did_quota = 1; ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, @@ -3983,7 +3979,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(dir, + dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(dir->i_sb, 1)); ocfs2_commit_trans(osb, handle); @@ -4165,11 +4161,10 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, goto out; } - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(osb->sb, 1))) { - ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (ret) goto out_commit; - } did_quota = 1; /* @@ -4229,7 +4224,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(dir, + dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(dir->i_sb, 1)); ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 558ce0312421..6cf3d8d18369 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -629,11 +629,10 @@ restart_all: } restarted_transaction: - if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, - clusters_to_add))) { - status = -EDQUOT; + status = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); + if (status) goto leave; - } did_quota = 1; /* reserve a write to the file entry early on - that we if we @@ -674,7 +673,7 @@ restarted_transaction: clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); spin_unlock(&OCFS2_I(inode)->ip_lock); /* Release unused quota reservation */ - vfs_dq_free_space(inode, + dquot_free_space(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); did_quota = 0; @@ -710,7 +709,7 @@ restarted_transaction: leave: if (status < 0 && did_quota) - vfs_dq_free_space(inode, + dquot_free_space(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); if (handle) { ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 50fb26a6a5f5..13adaa1f40cd 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1716,11 +1716,10 @@ static int ocfs2_symlink(struct inode *dir, u32 offset = 0; inode->i_op = &ocfs2_symlink_inode_operations; - if (vfs_dq_alloc_space_nodirty(inode, - ocfs2_clusters_to_bytes(osb->sb, 1))) { - status = -EDQUOT; + status = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (status) goto bail; - } did_quota = 1; status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, new_fe_bh, @@ -1788,7 +1787,7 @@ static int ocfs2_symlink(struct inode *dir, d_instantiate(dentry, inode); bail: if (status < 0 && did_quota) - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) vfs_dq_free_inode(inode); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index b437dc0c4cad..aa66fb277225 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -853,9 +853,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot) const struct dquot_operations ocfs2_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .alloc_space = dquot_alloc_space, .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = ocfs2_write_dquot, diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 10d021dd37c1..baf202c012cc 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1464,28 +1464,29 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) } /* - * Following four functions update i_blocks+i_bytes fields and - * quota information (together with appropriate checks) - * NOTE: We absolutely rely on the fact that caller dirties - * the inode (usually macros in quotaops.h care about this) and - * holds a handle for the current transaction so that dquot write and - * inode write go into the same transaction. + * This functions updates i_blocks+i_bytes fields and quota information + * (together with appropriate checks). + * + * NOTE: We absolutely rely on the fact that caller dirties the inode + * (usually helpers in quotaops.h care about this) and holds a handle for + * the current transaction so that dquot write and inode write go into the + * same transaction. */ /* * This operation can block, but only after everything is updated */ int __dquot_alloc_space(struct inode *inode, qsize_t number, - int warn, int reserve) + int warn, int reserve) { - int cnt, ret = QUOTA_OK; + int cnt, ret = 0; char warntype[MAXQUOTAS]; /* * First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) { + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { inode_incr_space(inode, number, reserve); goto out; } @@ -1498,9 +1499,9 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) - == NO_QUOTA) { - ret = NO_QUOTA; + if (check_bdq(inode->i_dquot[cnt], number, !warn, warntype+cnt) + == NO_QUOTA) { + ret = -EDQUOT; spin_unlock(&dq_data_lock); goto out_flush_warn; } @@ -1525,18 +1526,7 @@ out_flush_warn: out: return ret; } - -int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) -{ - return __dquot_alloc_space(inode, number, warn, 0); -} -EXPORT_SYMBOL(dquot_alloc_space); - -int dquot_reserve_space(struct inode *inode, qsize_t number, int warn) -{ - return __dquot_alloc_space(inode, number, warn, 1); -} -EXPORT_SYMBOL(dquot_reserve_space); +EXPORT_SYMBOL(__dquot_alloc_space); /* * This operation can block, but only after everything is updated @@ -1578,14 +1568,16 @@ warn_put_all: } EXPORT_SYMBOL(dquot_alloc_inode); -int dquot_claim_space(struct inode *inode, qsize_t number) +/* + * Convert in-memory reserved quotas to real consumed quotas + */ +int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { int cnt; - int ret = QUOTA_OK; - if (IS_NOQUOTA(inode)) { + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { inode_claim_rsv_space(inode, number); - goto out; + return 0; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1601,24 +1593,23 @@ int dquot_claim_space(struct inode *inode, qsize_t number) spin_unlock(&dq_data_lock); mark_all_dquot_dirty(inode->i_dquot); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return ret; + return 0; } -EXPORT_SYMBOL(dquot_claim_space); +EXPORT_SYMBOL(dquot_claim_space_nodirty); /* * This operation can block, but only after everything is updated */ -int __dquot_free_space(struct inode *inode, qsize_t number, int reserve) +void __dquot_free_space(struct inode *inode, qsize_t number, int reserve) { unsigned int cnt; char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) { + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { inode_decr_space(inode, number, reserve); - return QUOTA_OK; + return; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1641,24 +1632,8 @@ int __dquot_free_space(struct inode *inode, qsize_t number, int reserve) out_unlock: flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; -} - -int dquot_free_space(struct inode *inode, qsize_t number) -{ - return __dquot_free_space(inode, number, 0); -} -EXPORT_SYMBOL(dquot_free_space); - -/* - * Release reserved quota space - */ -void dquot_release_reserved_space(struct inode *inode, qsize_t number) -{ - __dquot_free_space(inode, number, 1); - } -EXPORT_SYMBOL(dquot_release_reserved_space); +EXPORT_SYMBOL(__dquot_free_space); /* * This operation can block, but only after everything is updated @@ -1840,9 +1815,7 @@ EXPORT_SYMBOL(dquot_commit_info); const struct dquot_operations dquot_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .alloc_space = dquot_alloc_space, .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = dquot_commit, diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 65c872761177..dc014f7def05 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -425,7 +425,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, journal_mark_dirty(th, s, sbh); if (for_unformatted) - vfs_dq_free_block_nodirty(inode, 1); + dquot_free_block_nodirty(inode, 1); } void reiserfs_free_block(struct reiserfs_transaction_handle *th, @@ -1049,7 +1049,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start amount_needed, hint->inode->i_uid); #endif quota_ret = - vfs_dq_alloc_block_nodirty(hint->inode, amount_needed); + dquot_alloc_block_nodirty(hint->inode, amount_needed); if (quota_ret) /* Quota exceeded? */ return QUOTA_EXCEEDED; if (hint->preallocate && hint->prealloc_size) { @@ -1058,7 +1058,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start "reiserquota: allocating (prealloc) %d blocks id=%u", hint->prealloc_size, hint->inode->i_uid); #endif - quota_ret = vfs_dq_prealloc_block_nodirty(hint->inode, + quota_ret = dquot_prealloc_block_nodirty(hint->inode, hint->prealloc_size); if (quota_ret) hint->preallocate = hint->prealloc_size = 0; @@ -1092,7 +1092,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start hint->inode->i_uid); #endif /* Free not allocated blocks */ - vfs_dq_free_block_nodirty(hint->inode, + dquot_free_block_nodirty(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); } @@ -1125,7 +1125,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start REISERFS_I(hint->inode)->i_prealloc_count, hint->inode->i_uid); #endif - vfs_dq_free_block_nodirty(hint->inode, amount_needed + + dquot_free_block_nodirty(hint->inode, amount_needed + hint->prealloc_size - nr_allocated - REISERFS_I(hint->inode)-> i_prealloc_count); diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 5fa7118f04e1..313d39d639eb 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1299,7 +1299,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, "reiserquota delete_item(): freeing %u, id=%u type=%c", quota_cut_bytes, inode->i_uid, head2type(&s_ih)); #endif - vfs_dq_free_space_nodirty(inode, quota_cut_bytes); + dquot_free_space_nodirty(inode, quota_cut_bytes); /* Return deleted body length */ return ret_value; @@ -1383,7 +1383,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, quota_cut_bytes, inode->i_uid, key2type(key)); #endif - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, quota_cut_bytes); } break; @@ -1733,7 +1733,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, "reiserquota cut_from_item(): freeing %u id=%u type=%c", quota_cut_bytes, inode->i_uid, '?'); #endif - vfs_dq_free_space_nodirty(inode, quota_cut_bytes); + dquot_free_space_nodirty(inode, quota_cut_bytes); return ret_value; } @@ -1968,9 +1968,10 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree key2type(&(key->on_disk_key))); #endif - if (vfs_dq_alloc_space_nodirty(inode, pasted_size)) { + retval = dquot_alloc_space_nodirty(inode, pasted_size); + if (retval) { pathrelse(search_path); - return -EDQUOT; + return retval; } init_tb_struct(th, &s_paste_balance, th->t_super, search_path, pasted_size); @@ -2024,7 +2025,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree pasted_size, inode->i_uid, key2type(&(key->on_disk_key))); #endif - vfs_dq_free_space_nodirty(inode, pasted_size); + dquot_free_space_nodirty(inode, pasted_size); return retval; } @@ -2062,9 +2063,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, #endif /* We can't dirty inode here. It would be immediately written but * appropriate stat item isn't inserted yet... */ - if (vfs_dq_alloc_space_nodirty(inode, quota_bytes)) { + retval = dquot_alloc_space_nodirty(inode, quota_bytes); + if (retval) { pathrelse(path); - return -EDQUOT; + return retval; } } init_tb_struct(th, &s_ins_balance, th->t_super, path, @@ -2113,6 +2115,6 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, quota_bytes, inode->i_uid, head2type(ih)); #endif if (inode) - vfs_dq_free_space_nodirty(inode, quota_bytes); + dquot_free_space_nodirty(inode, quota_bytes); return retval; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index b4a7dd03bdb9..ea4a77e9d7f5 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -618,9 +618,7 @@ static int reiserfs_quota_on(struct super_block *, int, int, char *, int); static const struct dquot_operations reiserfs_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .alloc_space = dquot_alloc_space, .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = reiserfs_write_dquot, diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 82372e332f08..e2ff180173a2 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -208,7 +208,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb, ((char *)bh->b_data)[(bit + i) >> 3]); } else { if (inode) - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); udf_add_free_space(sb, sbi->s_partition, 1); } } @@ -260,11 +260,11 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb, while (bit < (sb->s_blocksize << 3) && block_count > 0) { if (!udf_test_bit(bit, bh->b_data)) goto out; - else if (vfs_dq_prealloc_block(inode, 1)) + else if (dquot_prealloc_block(inode, 1)) goto out; else if (!udf_clear_bit(bit, bh->b_data)) { udf_debug("bit already cleared for block %d\n", bit); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto out; } block_count--; @@ -390,10 +390,14 @@ got_block: /* * Check quota for allocation of this block. */ - if (inode && vfs_dq_alloc_block(inode, 1)) { - mutex_unlock(&sbi->s_alloc_mutex); - *err = -EDQUOT; - return 0; + if (inode) { + int ret = dquot_alloc_block(inode, 1); + + if (ret) { + mutex_unlock(&sbi->s_alloc_mutex); + *err = ret; + return 0; + } } newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - @@ -449,7 +453,7 @@ static void udf_table_free_blocks(struct super_block *sb, /* We do this up front - There are some error conditions that could occure, but.. oh well */ if (inode) - vfs_dq_free_block(inode, count); + dquot_free_block(inode, count); udf_add_free_space(sb, sbi->s_partition, count); start = bloc->logicalBlockNum + offset; @@ -694,7 +698,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb, epos.offset -= adsize; alloc_count = (elen >> sb->s_blocksize_bits); - if (inode && vfs_dq_prealloc_block(inode, + if (inode && dquot_prealloc_block(inode, alloc_count > block_count ? block_count : alloc_count)) alloc_count = 0; else if (alloc_count > block_count) { @@ -797,12 +801,13 @@ static int udf_table_new_block(struct super_block *sb, newblock = goal_eloc.logicalBlockNum; goal_eloc.logicalBlockNum++; goal_elen -= sb->s_blocksize; - - if (inode && vfs_dq_alloc_block(inode, 1)) { - brelse(goal_epos.bh); - mutex_unlock(&sbi->s_alloc_mutex); - *err = -EDQUOT; - return 0; + if (inode) { + *err = dquot_alloc_block(inode, 1); + if (*err) { + brelse(goal_epos.bh); + mutex_unlock(&sbi->s_alloc_mutex); + return 0; + } } if (goal_elen) diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 54c16ec95dff..5cfa4d85ccf2 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -85,7 +85,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) "bit already cleared for fragment %u", i); } - vfs_dq_free_block(inode, count); + dquot_free_block(inode, count); fs32_add(sb, &ucg->cg_cs.cs_nffree, count); @@ -195,7 +195,7 @@ do_more: ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, 1); - vfs_dq_free_block(inode, uspi->s_fpb); + dquot_free_block(inode, uspi->s_fpb); fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); uspi->cs_total.cs_nbfree++; @@ -511,6 +511,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned cgno, fragno, fragoff, count, fragsize, i; + int ret; UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", (unsigned long long)fragment, oldcount, newcount); @@ -556,8 +557,9 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); for (i = oldcount; i < newcount; i++) ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); - if (vfs_dq_alloc_block(inode, count)) { - *err = -EDQUOT; + ret = dquot_alloc_block(inode, count); + if (ret) { + *err = ret; return 0; } @@ -596,6 +598,7 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, struct ufs_cylinder_group * ucg; unsigned oldcg, i, j, k, allocsize; u64 result; + int ret; UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", inode->i_ino, cgno, (unsigned long long)goal, count); @@ -664,7 +667,7 @@ cg_found: for (i = count; i < uspi->s_fpb; i++) ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); i = uspi->s_fpb - count; - vfs_dq_free_block(inode, i); + dquot_free_block(inode, i); fs32_add(sb, &ucg->cg_cs.cs_nffree, i); uspi->cs_total.cs_nffree += i; @@ -676,8 +679,9 @@ cg_found: result = ufs_bitmap_search (sb, ucpi, goal, allocsize); if (result == INVBLOCK) return 0; - if (vfs_dq_alloc_block(inode, count)) { - *err = -EDQUOT; + ret = dquot_alloc_block(inode, count); + if (ret) { + *err = ret; return 0; } for (i = 0; i < count; i++) @@ -714,6 +718,7 @@ static u64 ufs_alloccg_block(struct inode *inode, struct ufs_super_block_first * usb1; struct ufs_cylinder_group * ucg; u64 result, blkno; + int ret; UFSD("ENTER, goal %llu\n", (unsigned long long)goal); @@ -747,8 +752,9 @@ gotit: ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, -1); - if (vfs_dq_alloc_block(inode, uspi->s_fpb)) { - *err = -EDQUOT; + ret = dquot_alloc_block(inode, uspi->s_fpb); + if (ret) { + *err = ret; return INVBLOCK; } diff --git a/include/linux/quota.h b/include/linux/quota.h index edf34f2fe87d..1b14ad287fe3 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -297,9 +297,7 @@ struct quota_format_ops { struct dquot_operations { int (*initialize) (struct inode *, int); int (*drop) (struct inode *); - int (*alloc_space) (struct inode *, qsize_t, int); int (*alloc_inode) (const struct inode *, qsize_t); - int (*free_space) (struct inode *, qsize_t); int (*free_inode) (const struct inode *, qsize_t); int (*transfer) (struct inode *, qid_t *, unsigned long); int (*write_dquot) (struct dquot *); /* Ordinary dquot write */ @@ -309,12 +307,6 @@ struct dquot_operations { int (*release_dquot) (struct dquot *); /* Quota is going to be deleted from disk */ int (*mark_dirty) (struct dquot *); /* Dquot is marked dirty */ int (*write_info) (struct super_block *, int); /* Write of quota "superblock" */ - /* reserve quota for delayed block allocation */ - int (*reserve_space) (struct inode *, qsize_t, int); - /* claim reserved quota for delayed alloc */ - int (*claim_space) (struct inode *, qsize_t); - /* release rsved quota for delayed alloc */ - void (*release_rsv) (struct inode *, qsize_t); /* get reserved quota for delayed alloc, value returned is managed by * quota code only */ qsize_t *(*get_reserved_space) (struct inode *); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index e1cae204b5d9..47e85682e118 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -33,14 +33,13 @@ int dquot_scan_active(struct super_block *sb, struct dquot *dquot_alloc(struct super_block *sb, int type); void dquot_destroy(struct dquot *dquot); -int dquot_alloc_space(struct inode *inode, qsize_t number, int prealloc); -int dquot_alloc_inode(const struct inode *inode, qsize_t number); +int __dquot_alloc_space(struct inode *inode, qsize_t number, + int warn, int reserve); +void __dquot_free_space(struct inode *inode, qsize_t number, int reserve); -int dquot_reserve_space(struct inode *inode, qsize_t number, int prealloc); -int dquot_claim_space(struct inode *inode, qsize_t number); -void dquot_release_reserved_space(struct inode *inode, qsize_t number); +int dquot_alloc_inode(const struct inode *inode, qsize_t number); -int dquot_free_space(struct inode *inode, qsize_t number); +int dquot_claim_space_nodirty(struct inode *inode, qsize_t number); int dquot_free_inode(const struct inode *inode, qsize_t number); int dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask); @@ -149,60 +148,6 @@ static inline void vfs_dq_init(struct inode *inode) inode->i_sb->dq_op->initialize(inode, -1); } -/* The following allocation/freeing/transfer functions *must* be called inside - * a transaction (deadlocks possible otherwise) */ -static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) { - /* Used space is updated in alloc_space() */ - if (inode->i_sb->dq_op->alloc_space(inode, nr, 1) == NO_QUOTA) - return 1; - } - else - inode_add_bytes(inode, nr); - return 0; -} - -static inline int vfs_dq_prealloc_space(struct inode *inode, qsize_t nr) -{ - int ret; - if (!(ret = vfs_dq_prealloc_space_nodirty(inode, nr))) - mark_inode_dirty(inode); - return ret; -} - -static inline int vfs_dq_alloc_space_nodirty(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) { - /* Used space is updated in alloc_space() */ - if (inode->i_sb->dq_op->alloc_space(inode, nr, 0) == NO_QUOTA) - return 1; - } - else - inode_add_bytes(inode, nr); - return 0; -} - -static inline int vfs_dq_alloc_space(struct inode *inode, qsize_t nr) -{ - int ret; - if (!(ret = vfs_dq_alloc_space_nodirty(inode, nr))) - mark_inode_dirty(inode); - return ret; -} - -static inline int vfs_dq_reserve_space(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) { - /* Used space is updated in alloc_space() */ - if (inode->i_sb->dq_op->reserve_space(inode, nr, 0) == NO_QUOTA) - return 1; - } - else - inode_add_rsv_space(inode, nr); - return 0; -} - static inline int vfs_dq_alloc_inode(struct inode *inode) { if (sb_any_quota_active(inode->i_sb)) { @@ -213,47 +158,6 @@ static inline int vfs_dq_alloc_inode(struct inode *inode) return 0; } -/* - * Convert in-memory reserved quotas to real consumed quotas - */ -static inline int vfs_dq_claim_space(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) { - if (inode->i_sb->dq_op->claim_space(inode, nr) == NO_QUOTA) - return 1; - } else - inode_claim_rsv_space(inode, nr); - - mark_inode_dirty(inode); - return 0; -} - -/* - * Release reserved (in-memory) quotas - */ -static inline -void vfs_dq_release_reservation_space(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) - inode->i_sb->dq_op->release_rsv(inode, nr); - else - inode_sub_rsv_space(inode, nr); -} - -static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) - inode->i_sb->dq_op->free_space(inode, nr); - else - inode_sub_bytes(inode, nr); -} - -static inline void vfs_dq_free_space(struct inode *inode, qsize_t nr) -{ - vfs_dq_free_space_nodirty(inode, nr); - mark_inode_dirty(inode); -} - static inline void vfs_dq_free_inode(struct inode *inode) { if (sb_any_quota_active(inode->i_sb)) @@ -351,105 +255,109 @@ static inline int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) return 0; } -static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr) +static inline int __dquot_alloc_space(struct inode *inode, qsize_t number, + int warn, int reserve) { - inode_add_bytes(inode, nr); + if (!reserve) + inode_add_bytes(inode, number); return 0; } -static inline int vfs_dq_prealloc_space(struct inode *inode, qsize_t nr) +static inline void __dquot_free_space(struct inode *inode, qsize_t number, + int reserve) { - vfs_dq_prealloc_space_nodirty(inode, nr); - mark_inode_dirty(inode); - return 0; + if (!reserve) + inode_sub_bytes(inode, number); } -static inline int vfs_dq_alloc_space_nodirty(struct inode *inode, qsize_t nr) +static inline int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { - inode_add_bytes(inode, nr); + inode_add_bytes(inode, number); return 0; } -static inline int vfs_dq_alloc_space(struct inode *inode, qsize_t nr) +#endif /* CONFIG_QUOTA */ + +static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr) { - vfs_dq_alloc_space_nodirty(inode, nr); - mark_inode_dirty(inode); - return 0; + return __dquot_alloc_space(inode, nr, 1, 0); } -static inline int vfs_dq_reserve_space(struct inode *inode, qsize_t nr) +static inline int dquot_alloc_space(struct inode *inode, qsize_t nr) { - return 0; + int ret; + + ret = dquot_alloc_space_nodirty(inode, nr); + if (!ret) + mark_inode_dirty(inode); + return ret; } -static inline int vfs_dq_claim_space(struct inode *inode, qsize_t nr) +static inline int dquot_alloc_block_nodirty(struct inode *inode, qsize_t nr) { - return vfs_dq_alloc_space(inode, nr); + return dquot_alloc_space_nodirty(inode, nr << inode->i_blkbits); } -static inline -int vfs_dq_release_reservation_space(struct inode *inode, qsize_t nr) +static inline int dquot_alloc_block(struct inode *inode, qsize_t nr) { - return 0; + return dquot_alloc_space(inode, nr << inode->i_blkbits); } -static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr) +static inline int dquot_prealloc_block_nodirty(struct inode *inode, qsize_t nr) { - inode_sub_bytes(inode, nr); + return __dquot_alloc_space(inode, nr << inode->i_blkbits, 0, 0); } -static inline void vfs_dq_free_space(struct inode *inode, qsize_t nr) +static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr) { - vfs_dq_free_space_nodirty(inode, nr); - mark_inode_dirty(inode); -} - -#endif /* CONFIG_QUOTA */ + int ret; -static inline int vfs_dq_prealloc_block_nodirty(struct inode *inode, qsize_t nr) -{ - return vfs_dq_prealloc_space_nodirty(inode, nr << inode->i_blkbits); + ret = dquot_prealloc_block_nodirty(inode, nr); + if (!ret) + mark_inode_dirty(inode); + return ret; } -static inline int vfs_dq_prealloc_block(struct inode *inode, qsize_t nr) +static inline int dquot_reserve_block(struct inode *inode, qsize_t nr) { - return vfs_dq_prealloc_space(inode, nr << inode->i_blkbits); + return __dquot_alloc_space(inode, nr << inode->i_blkbits, 1, 1); } -static inline int vfs_dq_alloc_block_nodirty(struct inode *inode, qsize_t nr) +static inline int dquot_claim_block(struct inode *inode, qsize_t nr) { - return vfs_dq_alloc_space_nodirty(inode, nr << inode->i_blkbits); -} + int ret; -static inline int vfs_dq_alloc_block(struct inode *inode, qsize_t nr) -{ - return vfs_dq_alloc_space(inode, nr << inode->i_blkbits); + ret = dquot_claim_space_nodirty(inode, nr << inode->i_blkbits); + if (!ret) + mark_inode_dirty(inode); + return ret; } -static inline int vfs_dq_reserve_block(struct inode *inode, qsize_t nr) +static inline void dquot_free_space_nodirty(struct inode *inode, qsize_t nr) { - return vfs_dq_reserve_space(inode, nr << inode->i_blkbits); + __dquot_free_space(inode, nr, 0); } -static inline int vfs_dq_claim_block(struct inode *inode, qsize_t nr) +static inline void dquot_free_space(struct inode *inode, qsize_t nr) { - return vfs_dq_claim_space(inode, nr << inode->i_blkbits); + dquot_free_space_nodirty(inode, nr); + mark_inode_dirty(inode); } -static inline -void vfs_dq_release_reservation_block(struct inode *inode, qsize_t nr) +static inline void dquot_free_block_nodirty(struct inode *inode, qsize_t nr) { - vfs_dq_release_reservation_space(inode, nr << inode->i_blkbits); + dquot_free_space_nodirty(inode, nr << inode->i_blkbits); } -static inline void vfs_dq_free_block_nodirty(struct inode *inode, qsize_t nr) +static inline void dquot_free_block(struct inode *inode, qsize_t nr) { - vfs_dq_free_space_nodirty(inode, nr << inode->i_blkbits); + dquot_free_space(inode, nr << inode->i_blkbits); } -static inline void vfs_dq_free_block(struct inode *inode, qsize_t nr) +static inline void dquot_release_reservation_block(struct inode *inode, + qsize_t nr) { - vfs_dq_free_space(inode, nr << inode->i_blkbits); + __dquot_free_space(inode, nr << inode->i_blkbits, 1); } #endif /* _LINUX_QUOTAOPS_ */ -- cgit v1.2.3 From 63936ddaa16b9486e2d426ed7b09f559a5c60f87 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 3 Mar 2010 09:05:01 -0500 Subject: dquot: cleanup inode allocation / freeing routines Get rid of the alloc_inode and free_inode dquot operations - they are always called from the filesystem and if a filesystem really needs their own (which none currently does) it can just call into it's own routine directly. Also get rid of the vfs_dq_alloc/vfs_dq_free wrappers and always call the lowlevel dquot_alloc_inode / dqout_free_inode routines directly, which now lose the number argument which is always 1. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- Documentation/filesystems/Locking | 8 -------- fs/ext2/ialloc.c | 10 +++++----- fs/ext3/ialloc.c | 10 +++++----- fs/ext3/super.c | 2 -- fs/ext4/ialloc.c | 10 +++++----- fs/ext4/super.c | 2 -- fs/jfs/inode.c | 2 +- fs/jfs/jfs_inode.c | 6 +++--- fs/ocfs2/inode.c | 2 +- fs/ocfs2/namei.c | 30 +++++++++--------------------- fs/ocfs2/quota_global.c | 2 -- fs/quota/dquot.c | 29 +++++++++++++---------------- fs/reiserfs/inode.c | 10 +++++----- fs/reiserfs/super.c | 2 -- fs/udf/ialloc.c | 10 ++++++---- fs/ufs/ialloc.c | 7 ++++--- include/linux/quota.h | 2 -- include/linux/quotaops.h | 24 ++++-------------------- 18 files changed, 61 insertions(+), 107 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 1192fde11638..4428f55f2131 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -462,8 +462,6 @@ in sys_read() and friends. prototypes: int (*initialize) (struct inode *, int); int (*drop) (struct inode *); - int (*alloc_inode) (const struct inode *, unsigned long); - int (*free_inode) (const struct inode *, unsigned long); int (*transfer) (struct inode *, struct iattr *); int (*write_dquot) (struct dquot *); int (*acquire_dquot) (struct dquot *); @@ -479,8 +477,6 @@ What filesystem should expect from the generic quota functions: FS recursion Held locks when called initialize: yes maybe dqonoff_sem drop: yes - -alloc_inode: ->mark_dirty() - -free_inode: ->mark_dirty() - transfer: yes - write_dquot: yes dqonoff_sem or dqptr_sem acquire_dquot: yes dqonoff_sem or dqptr_sem @@ -491,10 +487,6 @@ write_info: yes dqonoff_sem FS recursion means calling ->quota_read() and ->quota_write() from superblock operations. -->alloc_inode(), ->free_inode() are called -only directly by the filesystem and do not call any fs functions only -the ->mark_dirty() operation. - More details about quota locking can be found in fs/dquot.c. --------------------------- vm_operations_struct ----------------------------- diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 15387c9c17d8..d12f9809559c 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -121,7 +121,7 @@ void ext2_free_inode (struct inode * inode) if (!is_bad_inode(inode)) { /* Quota is already initialized in iput() */ ext2_xattr_delete_inode(inode); - vfs_dq_free_inode(inode); + dquot_free_inode(inode); vfs_dq_drop(inode); } @@ -586,10 +586,10 @@ got: goto fail_drop; } - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + vfs_dq_init(inode); + err = dquot_alloc_inode(inode); + if (err) goto fail_drop; - } err = ext2_init_acl(inode, dir); if (err) @@ -605,7 +605,7 @@ got: return inode; fail_free_drop: - vfs_dq_free_inode(inode); + dquot_free_inode(inode); fail_drop: vfs_dq_drop(inode); diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index b39991285136..8bf00e997c38 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -125,7 +125,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) */ vfs_dq_init(inode); ext3_xattr_delete_inode(handle, inode); - vfs_dq_free_inode(inode); + dquot_free_inode(inode); vfs_dq_drop(inode); is_directory = S_ISDIR(inode->i_mode); @@ -588,10 +588,10 @@ got: sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; ret = inode; - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + vfs_dq_init(inode); + err = dquot_alloc_inode(inode); + if (err) goto fail_drop; - } err = ext3_init_acl(handle, inode, dir); if (err) @@ -619,7 +619,7 @@ really_out: return ret; fail_free_drop: - vfs_dq_free_inode(inode); + dquot_free_inode(inode); fail_drop: vfs_dq_drop(inode); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 8c13910a3782..8b8bc4f9cb14 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -752,8 +752,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, static const struct dquot_operations ext3_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .alloc_inode = dquot_alloc_inode, - .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = ext3_write_dquot, .acquire_dquot = ext3_acquire_dquot, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f3624ead4f6c..b0d744cf8b95 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -219,7 +219,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) */ vfs_dq_init(inode); ext4_xattr_delete_inode(handle, inode); - vfs_dq_free_inode(inode); + dquot_free_inode(inode); vfs_dq_drop(inode); is_directory = S_ISDIR(inode->i_mode); @@ -1034,10 +1034,10 @@ got: ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; ret = inode; - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + vfs_dq_init(inode); + err = dquot_alloc_inode(inode); + if (err) goto fail_drop; - } err = ext4_init_acl(handle, inode, dir); if (err) @@ -1074,7 +1074,7 @@ really_out: return ret; fail_free_drop: - vfs_dq_free_inode(inode); + dquot_free_inode(inode); fail_drop: vfs_dq_drop(inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fa8f4deda652..d231da8798e3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1017,8 +1017,6 @@ static const struct dquot_operations ext4_quota_operations = { #ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, #endif - .alloc_inode = dquot_alloc_inode, - .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index b2ae190a77ba..2562d18988f7 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -159,7 +159,7 @@ void jfs_delete_inode(struct inode *inode) * Free the inode from the quota allocation. */ vfs_dq_init(inode); - vfs_dq_free_inode(inode); + dquot_free_inode(inode); vfs_dq_drop(inode); } diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index dc0e02159ac9..7762f33e062b 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -116,10 +116,10 @@ struct inode *ialloc(struct inode *parent, umode_t mode) /* * Allocate inode to quota. */ - if (vfs_dq_alloc_inode(inode)) { - rc = -EDQUOT; + vfs_dq_init(inode); + rc = dquot_alloc_inode(inode); + if (rc) goto fail_drop; - } inode->i_mode = mode; /* inherit flags from parent */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 88459bdd1ff3..cb7f67d8441a 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -665,7 +665,7 @@ static int ocfs2_remove_inode(struct inode *inode, } ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); - vfs_dq_free_inode(inode); + dquot_free_inode(inode); status = ocfs2_free_dinode(handle, inode_alloc_inode, inode_alloc_bh, di); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 13adaa1f40cd..99766b6418eb 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -348,13 +348,9 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - /* We don't use standard VFS wrapper because we don't want vfs_dq_init - * to be called. */ - if (sb_any_quota_active(osb->sb) && - osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { - status = -EDQUOT; + status = dquot_alloc_inode(inode); + if (status) goto leave; - } did_quota_inode = 1; mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, @@ -431,7 +427,7 @@ static int ocfs2_mknod(struct inode *dir, status = 0; leave: if (status < 0 && did_quota_inode) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); @@ -1688,13 +1684,9 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } - /* We don't use standard VFS wrapper because we don't want vfs_dq_init - * to be called. */ - if (sb_any_quota_active(osb->sb) && - osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { - status = -EDQUOT; + status = dquot_alloc_inode(inode); + if (status) goto bail; - } did_quota_inode = 1; mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, @@ -1790,7 +1782,7 @@ bail: dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); @@ -2098,13 +2090,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, goto leave; } - /* We don't use standard VFS wrapper because we don't want vfs_dq_init - * to be called. */ - if (sb_any_quota_active(osb->sb) && - osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { - status = -EDQUOT; + status = dquot_alloc_inode(inode); + if (status) goto leave; - } did_quota_inode = 1; inode->i_nlink = 0; @@ -2139,7 +2127,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, insert_inode_hash(inode); leave: if (status < 0 && did_quota_inode) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index aa66fb277225..ed96b3eeb13c 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -853,8 +853,6 @@ static void ocfs2_destroy_dquot(struct dquot *dquot) const struct dquot_operations ocfs2_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .alloc_inode = dquot_alloc_inode, - .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = ocfs2_write_dquot, .acquire_dquot = ocfs2_acquire_dquot, diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index baf202c012cc..ed131318b849 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1531,15 +1531,15 @@ EXPORT_SYMBOL(__dquot_alloc_space); /* * This operation can block, but only after everything is updated */ -int dquot_alloc_inode(const struct inode *inode, qsize_t number) +int dquot_alloc_inode(const struct inode *inode) { - int cnt, ret = NO_QUOTA; + int cnt, ret = -EDQUOT; char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) - return QUOTA_OK; + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1547,7 +1547,7 @@ int dquot_alloc_inode(const struct inode *inode, qsize_t number) for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - if (check_idq(inode->i_dquot[cnt], number, warntype+cnt) + if (check_idq(inode->i_dquot[cnt], 1, warntype+cnt) == NO_QUOTA) goto warn_put_all; } @@ -1555,12 +1555,12 @@ int dquot_alloc_inode(const struct inode *inode, qsize_t number) for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - dquot_incr_inodes(inode->i_dquot[cnt], number); + dquot_incr_inodes(inode->i_dquot[cnt], 1); } - ret = QUOTA_OK; + ret = 0; warn_put_all: spin_unlock(&dq_data_lock); - if (ret == QUOTA_OK) + if (ret == 0) mark_all_dquot_dirty(inode->i_dquot); flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1638,29 +1638,28 @@ EXPORT_SYMBOL(__dquot_free_space); /* * This operation can block, but only after everything is updated */ -int dquot_free_inode(const struct inode *inode, qsize_t number) +void dquot_free_inode(const struct inode *inode) { unsigned int cnt; char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) - return QUOTA_OK; + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + return; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number); - dquot_decr_inodes(inode->i_dquot[cnt], number); + warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1); + dquot_decr_inodes(inode->i_dquot[cnt], 1); } spin_unlock(&dq_data_lock); mark_all_dquot_dirty(inode->i_dquot); flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; } EXPORT_SYMBOL(dquot_free_inode); @@ -1815,8 +1814,6 @@ EXPORT_SYMBOL(dquot_commit_info); const struct dquot_operations dquot_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .alloc_inode = dquot_alloc_inode, - .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = dquot_commit, .acquire_dquot = dquot_acquire, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 2df0f5c7c60b..f56a3d2e6497 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -54,7 +54,7 @@ void reiserfs_delete_inode(struct inode *inode) * after delete_object so that quota updates go into the same transaction as * stat data deletion */ if (!err) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (journal_end(&th, inode->i_sb, jbegin_count)) goto out; @@ -1765,10 +1765,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + vfs_dq_init(inode); + err = dquot_alloc_inode(inode); + if (err) goto out_end_trans; - } if (!dir->i_nlink) { err = -EPERM; goto out_bad_inode; @@ -1959,7 +1959,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, INODE_PKEY(inode)->k_objectid = 0; /* Quota change must be inside a transaction for journaling */ - vfs_dq_free_inode(inode); + dquot_free_inode(inode); out_end_trans: journal_end(th, th->t_super, th->t_blocks_allocated); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index ea4a77e9d7f5..e942ceecf2b8 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -618,8 +618,6 @@ static int reiserfs_quota_on(struct super_block *, int, int, char *, int); static const struct dquot_operations reiserfs_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .alloc_inode = dquot_alloc_inode, - .free_inode = dquot_free_inode, .transfer = dquot_transfer, .write_dquot = reiserfs_write_dquot, .acquire_dquot = reiserfs_acquire_dquot, diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index c10fa39f97e2..e1856b89c9c8 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -36,7 +36,7 @@ void udf_free_inode(struct inode *inode) * Note: we must free any quota before locking the superblock, * as writing the quota to disk may need the lock as well. */ - vfs_dq_free_inode(inode); + dquot_free_inode(inode); vfs_dq_drop(inode); clear_inode(inode); @@ -61,7 +61,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) struct super_block *sb = dir->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); struct inode *inode; - int block; + int block, ret; uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; struct udf_inode_info *iinfo; struct udf_inode_info *dinfo = UDF_I(dir); @@ -153,12 +153,14 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) insert_inode_hash(inode); mark_inode_dirty(inode); - if (vfs_dq_alloc_inode(inode)) { + vfs_dq_init(inode); + ret = dquot_alloc_inode(inode); + if (ret) { vfs_dq_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; iput(inode); - *err = -EDQUOT; + *err = ret; return NULL; } diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 3527c00fef0d..02f77882c573 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -95,7 +95,7 @@ void ufs_free_inode (struct inode * inode) is_directory = S_ISDIR(inode->i_mode); - vfs_dq_free_inode(inode); + dquot_free_inode(inode); vfs_dq_drop(inode); clear_inode (inode); @@ -355,9 +355,10 @@ cg_found: unlock_super (sb); - if (vfs_dq_alloc_inode(inode)) { + vfs_dq_init(inode); + err = dquot_alloc_inode(inode); + if (err) { vfs_dq_drop(inode); - err = -EDQUOT; goto fail_without_unlock; } diff --git a/include/linux/quota.h b/include/linux/quota.h index 1b14ad287fe3..e3b07895d327 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -297,8 +297,6 @@ struct quota_format_ops { struct dquot_operations { int (*initialize) (struct inode *, int); int (*drop) (struct inode *); - int (*alloc_inode) (const struct inode *, qsize_t); - int (*free_inode) (const struct inode *, qsize_t); int (*transfer) (struct inode *, qid_t *, unsigned long); int (*write_dquot) (struct dquot *); /* Ordinary dquot write */ struct dquot *(*alloc_dquot)(struct super_block *, int); /* Allocate memory for new dquot */ diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 47e85682e118..9ce7f051a4ba 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -37,10 +37,10 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int warn, int reserve); void __dquot_free_space(struct inode *inode, qsize_t number, int reserve); -int dquot_alloc_inode(const struct inode *inode, qsize_t number); +int dquot_alloc_inode(const struct inode *inode); int dquot_claim_space_nodirty(struct inode *inode, qsize_t number); -int dquot_free_inode(const struct inode *inode, qsize_t number); +void dquot_free_inode(const struct inode *inode); int dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask); int dquot_commit(struct dquot *dquot); @@ -148,22 +148,6 @@ static inline void vfs_dq_init(struct inode *inode) inode->i_sb->dq_op->initialize(inode, -1); } -static inline int vfs_dq_alloc_inode(struct inode *inode) -{ - if (sb_any_quota_active(inode->i_sb)) { - vfs_dq_init(inode); - if (inode->i_sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) - return 1; - } - return 0; -} - -static inline void vfs_dq_free_inode(struct inode *inode) -{ - if (sb_any_quota_active(inode->i_sb)) - inode->i_sb->dq_op->free_inode(inode, 1); -} - /* Cannot be called inside a transaction */ static inline int vfs_dq_off(struct super_block *sb, int remount) { @@ -231,12 +215,12 @@ static inline void vfs_dq_drop(struct inode *inode) { } -static inline int vfs_dq_alloc_inode(struct inode *inode) +static inline int dquot_alloc_inode(const struct inode *inode) { return 0; } -static inline void vfs_dq_free_inode(struct inode *inode) +static inline void dquot_free_inode(const struct inode *inode) { } -- cgit v1.2.3 From 759bfee658beab14af7b357156461d0eb852be2c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 3 Mar 2010 09:05:02 -0500 Subject: dquot: move dquot transfer responsibility into the filesystem Currently notify_change calls vfs_dq_transfer directly. This means we tie the quota code into the VFS. Get rid of that and make the filesystem responsible for the transfer. Most filesystems already do this, only ufs and udf need the code added, and for jfs it needs to be enabled unconditionally instead of only when ACLs are enabled. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/attr.c | 11 ++--------- fs/jfs/acl.c | 26 +------------------------- fs/jfs/file.c | 26 +++++++++++++++++++++++++- fs/jfs/jfs_acl.h | 7 ++++++- fs/jfs/jfs_inode.h | 1 + fs/jfs/namei.c | 2 +- fs/udf/file.c | 23 ++++++++++++++++++++++- fs/ufs/truncate.c | 7 +++++++ 8 files changed, 65 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index 96d394bdaddf..0a6ea54cde7d 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -12,7 +12,6 @@ #include <linux/capability.h> #include <linux/fsnotify.h> #include <linux/fcntl.h> -#include <linux/quotaops.h> #include <linux/security.h> /* Taken over from the old code... */ @@ -212,14 +211,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr) error = inode->i_op->setattr(dentry, attr); } else { error = inode_change_ok(inode, attr); - if (!error) { - if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) - error = vfs_dq_transfer(inode, attr) ? - -EDQUOT : 0; - if (!error) - error = inode_setattr(inode, attr); - } + if (!error) + error = inode_setattr(inode, attr); } if (ia_valid & ATTR_SIZE) diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index d66477c34306..213169780b6c 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -20,7 +20,6 @@ #include <linux/sched.h> #include <linux/fs.h> -#include <linux/quotaops.h> #include <linux/posix_acl_xattr.h> #include "jfs_incore.h" #include "jfs_txnmgr.h" @@ -174,7 +173,7 @@ cleanup: return rc; } -static int jfs_acl_chmod(struct inode *inode) +int jfs_acl_chmod(struct inode *inode) { struct posix_acl *acl, *clone; int rc; @@ -205,26 +204,3 @@ static int jfs_acl_chmod(struct inode *inode) posix_acl_release(clone); return rc; } - -int jfs_setattr(struct dentry *dentry, struct iattr *iattr) -{ - struct inode *inode = dentry->d_inode; - int rc; - - rc = inode_change_ok(inode, iattr); - if (rc) - return rc; - - if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || - (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { - if (vfs_dq_transfer(inode, iattr)) - return -EDQUOT; - } - - rc = inode_setattr(inode, iattr); - - if (!rc && (iattr->ia_valid & ATTR_MODE)) - rc = jfs_acl_chmod(inode); - - return rc; -} diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 2b70fa78e4a7..a4229e49330e 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -18,6 +18,7 @@ */ #include <linux/fs.h> +#include <linux/quotaops.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_dmap.h" @@ -88,14 +89,37 @@ static int jfs_release(struct inode *inode, struct file *file) return 0; } +int jfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int rc; + + rc = inode_change_ok(inode, iattr); + if (rc) + return rc; + + if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || + (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { + if (vfs_dq_transfer(inode, iattr)) + return -EDQUOT; + } + + rc = inode_setattr(inode, iattr); + + if (!rc && (iattr->ia_valid & ATTR_MODE)) + rc = jfs_acl_chmod(inode); + + return rc; +} + const struct inode_operations jfs_file_inode_operations = { .truncate = jfs_truncate, .setxattr = jfs_setxattr, .getxattr = jfs_getxattr, .listxattr = jfs_listxattr, .removexattr = jfs_removexattr, -#ifdef CONFIG_JFS_POSIX_ACL .setattr = jfs_setattr, +#ifdef CONFIG_JFS_POSIX_ACL .check_acl = jfs_check_acl, #endif }; diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index b07bd417ef85..54e07559878d 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -22,7 +22,7 @@ int jfs_check_acl(struct inode *, int); int jfs_init_acl(tid_t, struct inode *, struct inode *); -int jfs_setattr(struct dentry *, struct iattr *); +int jfs_acl_chmod(struct inode *inode); #else @@ -32,5 +32,10 @@ static inline int jfs_init_acl(tid_t tid, struct inode *inode, return 0; } +static inline int jfs_acl_chmod(struct inode *inode) +{ + return 0; +} + #endif #endif /* _H_JFS_ACL */ diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 1eff7db34d63..4b91b2787835 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -40,6 +40,7 @@ extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); extern void jfs_set_inode_flags(struct inode *); extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern int jfs_setattr(struct dentry *, struct iattr *); extern const struct address_space_operations jfs_aops; extern const struct inode_operations jfs_dir_inode_operations; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index c79a4270f083..1d1390afe55e 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1541,8 +1541,8 @@ const struct inode_operations jfs_dir_inode_operations = { .getxattr = jfs_getxattr, .listxattr = jfs_listxattr, .removexattr = jfs_removexattr, -#ifdef CONFIG_JFS_POSIX_ACL .setattr = jfs_setattr, +#ifdef CONFIG_JFS_POSIX_ACL .check_acl = jfs_check_acl, #endif }; diff --git a/fs/udf/file.c b/fs/udf/file.c index f311d509b6a3..35ca47281faa 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -34,6 +34,7 @@ #include <linux/errno.h> #include <linux/smp_lock.h> #include <linux/pagemap.h> +#include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/aio.h> @@ -217,6 +218,26 @@ const struct file_operations udf_file_operations = { .llseek = generic_file_llseek, }; +static int udf_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || + (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { + error = vfs_dq_transfer(inode, iattr) ? -EDQUOT : 0; + if (error) + return error; + } + + return inode_setattr(inode, iattr); +} + const struct inode_operations udf_file_inode_operations = { - .truncate = udf_truncate, + .truncate = udf_truncate, + .setattr = udf_setattr, }; diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 41dd431ce228..56ab31f00bd0 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -44,6 +44,7 @@ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/sched.h> +#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -517,6 +518,12 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + if (error) + return error; + } if (ia_valid & ATTR_SIZE && attr->ia_size != i_size_read(inode)) { loff_t old_i_size = inode->i_size; -- cgit v1.2.3 From b43fa8284d7790d9cca32c9c55e24f29be2fa33b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 3 Mar 2010 09:05:03 -0500 Subject: dquot: cleanup dquot transfer routine Get rid of the transfer dquot operation - it is now always called from the filesystem and if a filesystem really needs it's own (which none currently does) it can just call into it's own routine directly. Rename the now static low-level dquot_transfer helper to __dquot_transfer and vfs_dq_transfer to dquot_transfer to have a consistent namespace, and make the new dquot_transfer return a normal negative errno value which all callers expect. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- Documentation/filesystems/Locking | 2 -- drivers/staging/pohmelfs/inode.c | 2 +- fs/ext2/inode.c | 2 +- fs/ext3/inode.c | 2 +- fs/ext3/super.c | 1 - fs/ext4/inode.c | 2 +- fs/ext4/super.c | 1 - fs/jfs/file.c | 5 +++-- fs/ocfs2/file.c | 4 ++-- fs/ocfs2/quota_global.c | 1 - fs/quota/dquot.c | 12 +++++------- fs/reiserfs/inode.c | 3 +-- fs/reiserfs/super.c | 1 - fs/udf/file.c | 2 +- fs/ufs/truncate.c | 2 +- include/linux/quota.h | 1 - include/linux/quotaops.h | 5 ++--- 17 files changed, 19 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 4428f55f2131..4574e0272bdd 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -462,7 +462,6 @@ in sys_read() and friends. prototypes: int (*initialize) (struct inode *, int); int (*drop) (struct inode *); - int (*transfer) (struct inode *, struct iattr *); int (*write_dquot) (struct dquot *); int (*acquire_dquot) (struct dquot *); int (*release_dquot) (struct dquot *); @@ -477,7 +476,6 @@ What filesystem should expect from the generic quota functions: FS recursion Held locks when called initialize: yes maybe dqonoff_sem drop: yes - -transfer: yes - write_dquot: yes dqonoff_sem or dqptr_sem acquire_dquot: yes dqonoff_sem or dqptr_sem release_dquot: yes dqonoff_sem or dqptr_sem diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c index f69b7783027f..11fc4d5c43e1 100644 --- a/drivers/staging/pohmelfs/inode.c +++ b/drivers/staging/pohmelfs/inode.c @@ -969,7 +969,7 @@ int pohmelfs_setattr_raw(struct inode *inode, struct iattr *attr) if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { - err = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + err = dquot_transfer(inode, attr); if (err) goto err_out_exit; } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 71b032c65a02..3cfcfd9a131a 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1459,7 +1459,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) return error; if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { - error = vfs_dq_transfer(inode, iattr) ? -EDQUOT : 0; + error = dquot_transfer(inode, iattr); if (error) return error; } diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 20f02d69365c..14d40a4dd6f0 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -3160,7 +3160,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) error = PTR_ERR(handle); goto err_out; } - error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { ext3_journal_stop(handle); return error; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 8b8bc4f9cb14..f7d4a2c19dee 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -752,7 +752,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, static const struct dquot_operations ext3_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .transfer = dquot_transfer, .write_dquot = ext3_write_dquot, .acquire_dquot = ext3_acquire_dquot, .release_dquot = ext3_release_dquot, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9f607ea411c8..6a002a6d0624 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5263,7 +5263,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) error = PTR_ERR(handle); goto err_out; } - error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { ext4_journal_stop(handle); return error; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d231da8798e3..b4253fb7bab6 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1017,7 +1017,6 @@ static const struct dquot_operations ext4_quota_operations = { #ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, #endif - .transfer = dquot_transfer, .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, diff --git a/fs/jfs/file.c b/fs/jfs/file.c index a4229e49330e..2c201783836f 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -100,8 +100,9 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { - if (vfs_dq_transfer(inode, iattr)) - return -EDQUOT; + rc = dquot_transfer(inode, iattr); + if (rc) + return rc; } rc = inode_setattr(inode, iattr); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6cf3d8d18369..472e8f8bc892 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1020,7 +1020,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) /* * Gather pointers to quota structures so that allocation / * freeing of quota structures happens here and not inside - * vfs_dq_transfer() where we have problems with lock ordering + * dquot_transfer() where we have problems with lock ordering */ if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid && OCFS2_HAS_RO_COMPAT_FEATURE(sb, @@ -1053,7 +1053,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) mlog_errno(status); goto bail_unlock; } - status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + status = dquot_transfer(inode, attr); if (status < 0) goto bail_commit; } else { diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index ed96b3eeb13c..b654bd103b6f 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -853,7 +853,6 @@ static void ocfs2_destroy_dquot(struct dquot *dquot) const struct dquot_operations ocfs2_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .transfer = dquot_transfer, .write_dquot = ocfs2_write_dquot, .acquire_dquot = ocfs2_acquire_dquot, .release_dquot = ocfs2_release_dquot, diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ed131318b849..78ce4c48ad77 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1669,7 +1669,7 @@ EXPORT_SYMBOL(dquot_free_inode); * This operation can block, but only after everything is updated * A transaction must be started when entering this function. */ -int dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask) +static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask) { qsize_t space, cur_space; qsize_t rsv_space = 0; @@ -1766,12 +1766,11 @@ over_quota: ret = NO_QUOTA; goto warn_put_all; } -EXPORT_SYMBOL(dquot_transfer); /* Wrapper for transferring ownership of an inode for uid/gid only * Called from FSXXX_setattr() */ -int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) +int dquot_transfer(struct inode *inode, struct iattr *iattr) { qid_t chid[MAXQUOTAS]; unsigned long mask = 0; @@ -1786,12 +1785,12 @@ int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) } if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { vfs_dq_init(inode); - if (inode->i_sb->dq_op->transfer(inode, chid, mask) == NO_QUOTA) - return 1; + if (__dquot_transfer(inode, chid, mask) == NO_QUOTA) + return -EDQUOT; } return 0; } -EXPORT_SYMBOL(vfs_dq_transfer); +EXPORT_SYMBOL(dquot_transfer); /* * Write info of quota file to disk @@ -1814,7 +1813,6 @@ EXPORT_SYMBOL(dquot_commit_info); const struct dquot_operations dquot_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .transfer = dquot_transfer, .write_dquot = dquot_commit, .acquire_dquot = dquot_acquire, .release_dquot = dquot_release, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index f56a3d2e6497..99a5e5a8ab5a 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3134,8 +3134,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) jbegin_count); if (error) goto out; - error = - vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { journal_end(&th, inode->i_sb, jbegin_count); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index e942ceecf2b8..97c3e8ed7db6 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -618,7 +618,6 @@ static int reiserfs_quota_on(struct super_block *, int, int, char *, int); static const struct dquot_operations reiserfs_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, - .transfer = dquot_transfer, .write_dquot = reiserfs_write_dquot, .acquire_dquot = reiserfs_acquire_dquot, .release_dquot = reiserfs_release_dquot, diff --git a/fs/udf/file.c b/fs/udf/file.c index 35ca47281faa..2df7fcb677b3 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -229,7 +229,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *iattr) if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { - error = vfs_dq_transfer(inode, iattr) ? -EDQUOT : 0; + error = dquot_transfer(inode, iattr); if (error) return error; } diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 56ab31f00bd0..87bbab685901 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -520,7 +520,7 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr) if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { - error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) return error; } diff --git a/include/linux/quota.h b/include/linux/quota.h index e3b07895d327..422e6aa78edc 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -297,7 +297,6 @@ struct quota_format_ops { struct dquot_operations { int (*initialize) (struct inode *, int); int (*drop) (struct inode *); - int (*transfer) (struct inode *, qid_t *, unsigned long); int (*write_dquot) (struct dquot *); /* Ordinary dquot write */ struct dquot *(*alloc_dquot)(struct super_block *, int); /* Allocate memory for new dquot */ void (*destroy_dquot)(struct dquot *); /* Free memory for dquot */ diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 9ce7f051a4ba..fa27b7218c82 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -42,7 +42,6 @@ int dquot_alloc_inode(const struct inode *inode); int dquot_claim_space_nodirty(struct inode *inode, qsize_t number); void dquot_free_inode(const struct inode *inode); -int dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask); int dquot_commit(struct dquot *dquot); int dquot_acquire(struct dquot *dquot); int dquot_release(struct dquot *dquot); @@ -66,7 +65,7 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di); void vfs_dq_drop(struct inode *inode); -int vfs_dq_transfer(struct inode *inode, struct iattr *iattr); +int dquot_transfer(struct inode *inode, struct iattr *iattr); int vfs_dq_quota_on_remount(struct super_block *sb); static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type) @@ -234,7 +233,7 @@ static inline int vfs_dq_quota_on_remount(struct super_block *sb) return 0; } -static inline int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) +static inline int dquot_transfer(struct inode *inode, struct iattr *iattr) { return 0; } -- cgit v1.2.3 From 257ba15cedf1288f0c96118d7e63947231d27278 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 3 Mar 2010 09:05:04 -0500 Subject: dquot: move dquot drop responsibility into the filesystem Currently clear_inode calls vfs_dq_drop directly. This means we tie the quota code into the VFS. Get rid of that and make the filesystem responsible for the drop inside the ->clear_inode superblock operation. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/ext2/super.c | 2 ++ fs/ext3/super.c | 2 ++ fs/ext4/super.c | 1 + fs/inode.c | 1 - fs/jfs/super.c | 6 ++++++ fs/ocfs2/inode.c | 2 ++ fs/reiserfs/super.c | 6 ++++++ fs/udf/inode.c | 2 ++ fs/ufs/super.c | 6 ++++++ 9 files changed, 27 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f9cb54a585ce..98815d2a5664 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -194,6 +194,8 @@ static void destroy_inodecache(void) static void ext2_clear_inode(struct inode *inode) { struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info; + + vfs_dq_drop(inode); ext2_discard_reservation(inode); EXT2_I(inode)->i_block_alloc_info = NULL; if (unlikely(rsv)) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index f7d4a2c19dee..2277b1a98e62 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -528,6 +528,8 @@ static void destroy_inodecache(void) static void ext3_clear_inode(struct inode *inode) { struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info; + + vfs_dq_drop(inode); ext3_discard_reservation(inode); EXT3_I(inode)->i_block_alloc_info = NULL; if (unlikely(rsv)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b4253fb7bab6..56554c8850ec 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -761,6 +761,7 @@ static void destroy_inodecache(void) static void ext4_clear_inode(struct inode *inode) { + vfs_dq_drop(inode); ext4_discard_preallocations(inode); if (EXT4_JOURNAL(inode)) jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, diff --git a/fs/inode.c b/fs/inode.c index 03dfeb2e3928..f1aef3482b0e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -314,7 +314,6 @@ void clear_inode(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); inode_sync_wait(inode); - vfs_dq_drop(inode); if (inode->i_sb->s_op->clear_inode) inode->i_sb->s_op->clear_inode(inode); if (S_ISBLK(inode->i_mode) && inode->i_bdev) diff --git a/fs/jfs/super.c b/fs/jfs/super.c index d929a822a74e..4086fa593419 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -131,6 +131,11 @@ static void jfs_destroy_inode(struct inode *inode) kmem_cache_free(jfs_inode_cachep, ji); } +static void jfs_clear_inode(struct inode *inode) +{ + vfs_dq_drop(inode); +} + static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); @@ -745,6 +750,7 @@ static const struct super_operations jfs_super_operations = { .dirty_inode = jfs_dirty_inode, .write_inode = jfs_write_inode, .delete_inode = jfs_delete_inode, + .clear_inode = jfs_clear_inode, .put_super = jfs_put_super, .sync_fs = jfs_sync_fs, .freeze_fs = jfs_freeze, diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index cb7f67d8441a..13eb5d467c40 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1087,6 +1087,8 @@ void ocfs2_clear_inode(struct inode *inode) mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, "Inode=%lu\n", inode->i_ino); + vfs_dq_drop(inode); + /* To preven remote deletes we hold open lock before, now it * is time to unlock PR and EX open locks. */ ocfs2_open_unlock(inode); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 97c3e8ed7db6..6b24e70e329b 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -578,6 +578,11 @@ out: reiserfs_write_unlock_once(inode->i_sb, lock_depth); } +static void reiserfs_clear_inode(struct inode *inode) +{ + vfs_dq_drop(inode); +} + #ifdef CONFIG_QUOTA static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, size_t, loff_t); @@ -590,6 +595,7 @@ static const struct super_operations reiserfs_sops = { .destroy_inode = reiserfs_destroy_inode, .write_inode = reiserfs_write_inode, .dirty_inode = reiserfs_dirty_inode, + .clear_inode = reiserfs_clear_inode, .delete_inode = reiserfs_delete_inode, .put_super = reiserfs_put_super, .write_super = reiserfs_write_super, diff --git a/fs/udf/inode.c b/fs/udf/inode.c index f90231eb2916..859389a3832b 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -108,6 +108,8 @@ void udf_clear_inode(struct inode *inode) (unsigned long long)inode->i_size, (unsigned long long)iinfo->i_lenExtents); } + + vfs_dq_drop(inode); kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 143c20bfb04b..95d61cb3a5b8 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1432,6 +1432,11 @@ static void destroy_inodecache(void) kmem_cache_destroy(ufs_inode_cachep); } +static void ufs_clear_inode(struct inode *inode) +{ + vfs_dq_drop(inode); +} + #ifdef CONFIG_QUOTA static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t); static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t); @@ -1442,6 +1447,7 @@ static const struct super_operations ufs_super_ops = { .destroy_inode = ufs_destroy_inode, .write_inode = ufs_write_inode, .delete_inode = ufs_delete_inode, + .clear_inode = ufs_clear_inode, .put_super = ufs_put_super, .write_super = ufs_write_super, .sync_fs = ufs_sync_fs, -- cgit v1.2.3 From 9f7547580263d4a55efe06ce5cfd567f568be6e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 3 Mar 2010 09:05:05 -0500 Subject: dquot: cleanup dquot drop routine Get rid of the drop dquot operation - it is now always called from the filesystem and if a filesystem really needs it's own (which none currently does) it can just call into it's own routine directly. Rename the now static low-level dquot_drop helper to __dquot_drop and vfs_dq_drop to dquot_drop to have a consistent namespace. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- Documentation/filesystems/Locking | 2 -- fs/ext2/ialloc.c | 4 +-- fs/ext2/super.c | 2 +- fs/ext3/ialloc.c | 4 +-- fs/ext3/super.c | 3 +-- fs/ext4/ialloc.c | 4 +-- fs/ext4/super.c | 3 +-- fs/jfs/inode.c | 2 +- fs/jfs/jfs_inode.c | 2 +- fs/jfs/super.c | 2 +- fs/ocfs2/inode.c | 2 +- fs/ocfs2/quota_global.c | 1 - fs/quota/dquot.c | 52 +++++++++++++++++++-------------------- fs/reiserfs/inode.c | 2 +- fs/reiserfs/namei.c | 2 +- fs/reiserfs/super.c | 3 +-- fs/udf/ialloc.c | 4 +-- fs/udf/inode.c | 2 +- fs/ufs/ialloc.c | 4 +-- fs/ufs/super.c | 2 +- include/linux/quota.h | 1 - include/linux/quotaops.h | 5 ++-- 22 files changed, 49 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 4574e0272bdd..fa10e4bf8e5e 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -461,7 +461,6 @@ in sys_read() and friends. --------------------------- dquot_operations ------------------------------- prototypes: int (*initialize) (struct inode *, int); - int (*drop) (struct inode *); int (*write_dquot) (struct dquot *); int (*acquire_dquot) (struct dquot *); int (*release_dquot) (struct dquot *); @@ -475,7 +474,6 @@ What filesystem should expect from the generic quota functions: FS recursion Held locks when called initialize: yes maybe dqonoff_sem -drop: yes - write_dquot: yes dqonoff_sem or dqptr_sem acquire_dquot: yes dqonoff_sem or dqptr_sem release_dquot: yes dqonoff_sem or dqptr_sem diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index d12f9809559c..88b71972c626 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -122,7 +122,7 @@ void ext2_free_inode (struct inode * inode) /* Quota is already initialized in iput() */ ext2_xattr_delete_inode(inode); dquot_free_inode(inode); - vfs_dq_drop(inode); + dquot_drop(inode); } es = EXT2_SB(sb)->s_es; @@ -608,7 +608,7 @@ fail_free_drop: dquot_free_inode(inode); fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; unlock_new_inode(inode); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 98815d2a5664..42e4a303b675 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -195,7 +195,7 @@ static void ext2_clear_inode(struct inode *inode) { struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info; - vfs_dq_drop(inode); + dquot_drop(inode); ext2_discard_reservation(inode); EXT2_I(inode)->i_block_alloc_info = NULL; if (unlikely(rsv)) diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 8bf00e997c38..7d7238f9f6f3 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -126,7 +126,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) vfs_dq_init(inode); ext3_xattr_delete_inode(handle, inode); dquot_free_inode(inode); - vfs_dq_drop(inode); + dquot_drop(inode); is_directory = S_ISDIR(inode->i_mode); @@ -622,7 +622,7 @@ fail_free_drop: dquot_free_inode(inode); fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; unlock_new_inode(inode); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 2277b1a98e62..0163d0dae124 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -529,7 +529,7 @@ static void ext3_clear_inode(struct inode *inode) { struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info; - vfs_dq_drop(inode); + dquot_drop(inode); ext3_discard_reservation(inode); EXT3_I(inode)->i_block_alloc_info = NULL; if (unlikely(rsv)) @@ -753,7 +753,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, static const struct dquot_operations ext3_quota_operations = { .initialize = dquot_initialize, - .drop = dquot_drop, .write_dquot = ext3_write_dquot, .acquire_dquot = ext3_acquire_dquot, .release_dquot = ext3_release_dquot, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b0d744cf8b95..ca8986e4b528 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -220,7 +220,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) vfs_dq_init(inode); ext4_xattr_delete_inode(handle, inode); dquot_free_inode(inode); - vfs_dq_drop(inode); + dquot_drop(inode); is_directory = S_ISDIR(inode->i_mode); @@ -1077,7 +1077,7 @@ fail_free_drop: dquot_free_inode(inode); fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; unlock_new_inode(inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 56554c8850ec..035516c80df2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -761,7 +761,7 @@ static void destroy_inodecache(void) static void ext4_clear_inode(struct inode *inode) { - vfs_dq_drop(inode); + dquot_drop(inode); ext4_discard_preallocations(inode); if (EXT4_JOURNAL(inode)) jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, @@ -1014,7 +1014,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, static const struct dquot_operations ext4_quota_operations = { .initialize = dquot_initialize, - .drop = dquot_drop, #ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, #endif diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 2562d18988f7..22fa412c5289 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -160,7 +160,7 @@ void jfs_delete_inode(struct inode *inode) */ vfs_dq_init(inode); dquot_free_inode(inode); - vfs_dq_drop(inode); + dquot_drop(inode); } clear_inode(inode); diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 7762f33e062b..72b30895422c 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -162,7 +162,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) return inode; fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; fail_unlock: inode->i_nlink = 0; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 4086fa593419..266699deb1c6 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -133,7 +133,7 @@ static void jfs_destroy_inode(struct inode *inode) static void jfs_clear_inode(struct inode *inode) { - vfs_dq_drop(inode); + dquot_drop(inode); } static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 13eb5d467c40..00eb6a095e68 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1087,7 +1087,7 @@ void ocfs2_clear_inode(struct inode *inode) mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, "Inode=%lu\n", inode->i_ino); - vfs_dq_drop(inode); + dquot_drop(inode); /* To preven remote deletes we hold open lock before, now it * is time to unlock PR and EX open locks. */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index b654bd103b6f..4dca38f487cf 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -852,7 +852,6 @@ static void ocfs2_destroy_dquot(struct dquot *dquot) const struct dquot_operations ocfs2_quota_operations = { .initialize = dquot_initialize, - .drop = dquot_drop, .write_dquot = ocfs2_write_dquot, .acquire_dquot = ocfs2_acquire_dquot, .release_dquot = ocfs2_release_dquot, diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 78ce4c48ad77..cd83c5b871ba 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1358,7 +1358,7 @@ EXPORT_SYMBOL(dquot_initialize); /* * Release all quotas referenced by inode */ -int dquot_drop(struct inode *inode) +static void __dquot_drop(struct inode *inode) { int cnt; struct dquot *put[MAXQUOTAS]; @@ -1370,32 +1370,31 @@ int dquot_drop(struct inode *inode) } up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); dqput_all(put); - return 0; } -EXPORT_SYMBOL(dquot_drop); -/* Wrapper to remove references to quota structures from inode */ -void vfs_dq_drop(struct inode *inode) -{ - /* Here we can get arbitrary inode from clear_inode() so we have - * to be careful. OTOH we don't need locking as quota operations - * are allowed to change only at mount time */ - if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op - && inode->i_sb->dq_op->drop) { - int cnt; - /* Test before calling to rule out calls from proc and such - * where we are not allowed to block. Note that this is - * actually reliable test even without the lock - the caller - * must assure that nobody can come after the DQUOT_DROP and - * add quota pointers back anyway */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - break; - if (cnt < MAXQUOTAS) - inode->i_sb->dq_op->drop(inode); - } -} -EXPORT_SYMBOL(vfs_dq_drop); +void dquot_drop(struct inode *inode) +{ + int cnt; + + if (IS_NOQUOTA(inode)) + return; + + /* + * Test before calling to rule out calls from proc and such + * where we are not allowed to block. Note that this is + * actually reliable test even without the lock - the caller + * must assure that nobody can come after the DQUOT_DROP and + * add quota pointers back anyway. + */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (inode->i_dquot[cnt]) + break; + } + + if (cnt < MAXQUOTAS) + __dquot_drop(inode); +} +EXPORT_SYMBOL(dquot_drop); /* * inode_reserved_space is managed internally by quota, and protected by @@ -1812,7 +1811,6 @@ EXPORT_SYMBOL(dquot_commit_info); */ const struct dquot_operations dquot_operations = { .initialize = dquot_initialize, - .drop = dquot_drop, .write_dquot = dquot_commit, .acquire_dquot = dquot_acquire, .release_dquot = dquot_release, @@ -2029,7 +2027,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, * When S_NOQUOTA is set, remove dquot references as no more * references can be added */ - sb->dq_op->drop(inode); + __dquot_drop(inode); } error = -EIO; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 99a5e5a8ab5a..f07c3b69247d 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1964,7 +1964,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, out_end_trans: journal_end(th, th->t_super, th->t_blocks_allocated); /* Drop can be outside and it needs more credits so it's better to have it outside */ - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; make_bad_inode(inode); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 9d4dcf0b07cb..9dea84e8a79a 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -546,7 +546,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, */ static int drop_new_inode(struct inode *inode) { - vfs_dq_drop(inode); + dquot_drop(inode); make_bad_inode(inode); inode->i_flags |= S_NOQUOTA; iput(inode); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 6b24e70e329b..34f7cd0cb02d 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -580,7 +580,7 @@ out: static void reiserfs_clear_inode(struct inode *inode) { - vfs_dq_drop(inode); + dquot_drop(inode); } #ifdef CONFIG_QUOTA @@ -623,7 +623,6 @@ static int reiserfs_quota_on(struct super_block *, int, int, char *, int); static const struct dquot_operations reiserfs_quota_operations = { .initialize = dquot_initialize, - .drop = dquot_drop, .write_dquot = reiserfs_write_dquot, .acquire_dquot = reiserfs_acquire_dquot, .release_dquot = reiserfs_release_dquot, diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index e1856b89c9c8..15c6e992e587 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -37,7 +37,7 @@ void udf_free_inode(struct inode *inode) * as writing the quota to disk may need the lock as well. */ dquot_free_inode(inode); - vfs_dq_drop(inode); + dquot_drop(inode); clear_inode(inode); @@ -156,7 +156,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) vfs_dq_init(inode); ret = dquot_alloc_inode(inode); if (ret) { - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; iput(inode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 859389a3832b..1199e8e21ee2 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -109,7 +109,7 @@ void udf_clear_inode(struct inode *inode) (unsigned long long)iinfo->i_lenExtents); } - vfs_dq_drop(inode); + dquot_drop(inode); kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; } diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 02f77882c573..67b4bdb056fb 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -96,7 +96,7 @@ void ufs_free_inode (struct inode * inode) is_directory = S_ISDIR(inode->i_mode); dquot_free_inode(inode); - vfs_dq_drop(inode); + dquot_drop(inode); clear_inode (inode); @@ -358,7 +358,7 @@ cg_found: vfs_dq_init(inode); err = dquot_alloc_inode(inode); if (err) { - vfs_dq_drop(inode); + dquot_drop(inode); goto fail_without_unlock; } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 95d61cb3a5b8..66b63a751615 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1434,7 +1434,7 @@ static void destroy_inodecache(void) static void ufs_clear_inode(struct inode *inode) { - vfs_dq_drop(inode); + dquot_drop(inode); } #ifdef CONFIG_QUOTA diff --git a/include/linux/quota.h b/include/linux/quota.h index 422e6aa78edc..aec2e9dac2d7 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -296,7 +296,6 @@ struct quota_format_ops { /* Operations working with dquots */ struct dquot_operations { int (*initialize) (struct inode *, int); - int (*drop) (struct inode *); int (*write_dquot) (struct dquot *); /* Ordinary dquot write */ struct dquot *(*alloc_dquot)(struct super_block *, int); /* Allocate memory for new dquot */ void (*destroy_dquot)(struct dquot *); /* Free memory for dquot */ diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index fa27b7218c82..a5ebd1abccd8 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -24,7 +24,7 @@ void inode_claim_rsv_space(struct inode *inode, qsize_t number); void inode_sub_rsv_space(struct inode *inode, qsize_t number); int dquot_initialize(struct inode *inode, int type); -int dquot_drop(struct inode *inode); +void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, unsigned int id, int type); void dqput(struct dquot *dquot); int dquot_scan_active(struct super_block *sb, @@ -64,7 +64,6 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii); int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di); int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di); -void vfs_dq_drop(struct inode *inode); int dquot_transfer(struct inode *inode, struct iattr *iattr); int vfs_dq_quota_on_remount(struct super_block *sb); @@ -210,7 +209,7 @@ static inline void vfs_dq_init(struct inode *inode) { } -static inline void vfs_dq_drop(struct inode *inode) +static inline void dquot_drop(struct inode *inode) { } -- cgit v1.2.3 From 907f4554e2521cb28b0009d17167760650a9561c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 3 Mar 2010 09:05:06 -0500 Subject: dquot: move dquot initialization responsibility into the filesystem Currently various places in the VFS call vfs_dq_init directly. This means we tie the quota code into the VFS. Get rid of that and make the filesystem responsible for the initialization. For most metadata operations this is a straight forward move into the methods, but for truncate and open it's a bit more complicated. For truncate we currently only call vfs_dq_init for the sys_truncate case because open already takes care of it for ftruncate and open(O_TRUNC) - the new code causes an additional vfs_dq_init for those which is harmless. For open the initialization is moved from do_filp_open into the open method, which means it happens slightly earlier now, and only for regular files. The latter is fine because we don't need to initialize it for operations on special files, and we already do it as part of the namespace operations for directories. Add a dquot_file_open helper that filesystems that support generic quotas can use to fill in ->open. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/ext2/file.c | 4 ++-- fs/ext2/inode.c | 5 +++++ fs/ext2/namei.c | 51 ++++++++++++++++++++++++++++++++---------------- fs/ext3/file.c | 2 +- fs/ext3/inode.c | 5 +++++ fs/ext3/namei.c | 18 +++++++++++++++++ fs/ext4/file.c | 2 +- fs/ext4/inode.c | 5 +++++ fs/ext4/namei.c | 17 ++++++++++++++++ fs/inode.c | 3 --- fs/jfs/file.c | 4 +++- fs/jfs/inode.c | 3 +++ fs/jfs/namei.c | 15 ++++++++++++++ fs/namei.c | 16 --------------- fs/nfsd/vfs.c | 4 ---- fs/ocfs2/file.c | 5 +++++ fs/ocfs2/inode.c | 2 ++ fs/ocfs2/namei.c | 11 +++++++++++ fs/open.c | 5 +---- fs/quota/dquot.c | 14 +++++++++++++ fs/reiserfs/file.c | 2 +- fs/reiserfs/inode.c | 5 +++++ fs/reiserfs/namei.c | 17 ++++++++++++++++ fs/reiserfs/xattr.c | 4 ---- fs/udf/file.c | 5 ++++- fs/udf/inode.c | 4 ++++ fs/udf/namei.c | 17 ++++++++++++++++ fs/ufs/file.c | 2 +- fs/ufs/inode.c | 4 ++++ fs/ufs/namei.c | 18 +++++++++++++++++ fs/ufs/truncate.c | 3 +++ include/linux/quotaops.h | 4 ++++ 32 files changed, 220 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 586e3589d4c2..d11f6e484519 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -70,7 +70,7 @@ const struct file_operations ext2_file_operations = { .compat_ioctl = ext2_compat_ioctl, #endif .mmap = generic_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, .splice_read = generic_file_splice_read, @@ -87,7 +87,7 @@ const struct file_operations ext2_xip_file_operations = { .compat_ioctl = ext2_compat_ioctl, #endif .mmap = xip_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, }; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 3cfcfd9a131a..c87840c33e17 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -58,6 +58,8 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode) */ void ext2_delete_inode (struct inode * inode) { + if (!is_bad_inode(inode)) + vfs_dq_init(inode); truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -1457,6 +1459,9 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) error = inode_change_ok(inode, iattr); if (error) return error; + + if (iattr->ia_valid & ATTR_SIZE) + vfs_dq_init(inode); if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { error = dquot_transfer(inode, iattr); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index dd7175ce5606..5923df7b22af 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -31,6 +31,7 @@ */ #include <linux/pagemap.h> +#include <linux/quotaops.h> #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -99,24 +100,27 @@ struct dentry *ext2_get_parent(struct dentry *child) */ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) { - struct inode * inode = ext2_new_inode (dir, mode); - int err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { - inode->i_mapping->a_ops = &ext2_aops_xip; - inode->i_fop = &ext2_xip_file_operations; - } else if (test_opt(inode->i_sb, NOBH)) { - inode->i_mapping->a_ops = &ext2_nobh_aops; - inode->i_fop = &ext2_file_operations; - } else { - inode->i_mapping->a_ops = &ext2_aops; - inode->i_fop = &ext2_file_operations; - } - mark_inode_dirty(inode); - err = ext2_add_nondir(dentry, inode); + struct inode *inode; + + vfs_dq_init(dir); + + inode = ext2_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &ext2_file_inode_operations; + if (ext2_use_xip(inode->i_sb)) { + inode->i_mapping->a_ops = &ext2_aops_xip; + inode->i_fop = &ext2_xip_file_operations; + } else if (test_opt(inode->i_sb, NOBH)) { + inode->i_mapping->a_ops = &ext2_nobh_aops; + inode->i_fop = &ext2_file_operations; + } else { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_file_operations; } - return err; + mark_inode_dirty(inode); + return ext2_add_nondir(dentry, inode); } static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) @@ -127,6 +131,8 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_ if (!new_valid_dev(rdev)) return -EINVAL; + vfs_dq_init(dir); + inode = ext2_new_inode (dir, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { @@ -151,6 +157,8 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out; + vfs_dq_init(dir); + inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) @@ -194,6 +202,8 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, if (inode->i_nlink >= EXT2_LINK_MAX) return -EMLINK; + vfs_dq_init(dir); + inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); atomic_inc(&inode->i_count); @@ -216,6 +226,8 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= EXT2_LINK_MAX) goto out; + vfs_dq_init(dir); + inode_inc_link_count(dir); inode = ext2_new_inode (dir, S_IFDIR | mode); @@ -262,6 +274,8 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry) struct page * page; int err = -ENOENT; + vfs_dq_init(dir); + de = ext2_find_entry (dir, &dentry->d_name, &page); if (!de) goto out; @@ -304,6 +318,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, struct ext2_dir_entry_2 * old_de; int err = -ENOENT; + vfs_dq_init(old_dir); + vfs_dq_init(new_dir); + old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; diff --git a/fs/ext3/file.c b/fs/ext3/file.c index a86d3302cdc2..3c7fb11a3b29 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -62,7 +62,7 @@ const struct file_operations ext3_file_operations = { .compat_ioctl = ext3_compat_ioctl, #endif .mmap = generic_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .release = ext3_release_file, .fsync = ext3_sync_file, .splice_read = generic_file_splice_read, diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 14d40a4dd6f0..d7962b0c57b3 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -196,6 +196,9 @@ void ext3_delete_inode (struct inode * inode) { handle_t *handle; + if (!is_bad_inode(inode)) + vfs_dq_init(inode); + truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -3148,6 +3151,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + if (ia_valid & ATTR_SIZE) + vfs_dq_init(inode); if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { handle_t *handle; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 7b0e44f7d66f..a492b371b134 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1696,6 +1696,8 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, struct inode * inode; int err, retries = 0; + vfs_dq_init(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -1730,6 +1732,8 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; + vfs_dq_init(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -1766,6 +1770,8 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= EXT3_LINK_MAX) return -EMLINK; + vfs_dq_init(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -2060,7 +2066,9 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ + vfs_dq_init(dir); vfs_dq_init(dentry->d_inode); + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2119,7 +2127,9 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ + vfs_dq_init(dir); vfs_dq_init(dentry->d_inode); + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2174,6 +2184,8 @@ static int ext3_symlink (struct inode * dir, if (l > dir->i_sb->s_blocksize) return -ENAMETOOLONG; + vfs_dq_init(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + @@ -2228,6 +2240,9 @@ static int ext3_link (struct dentry * old_dentry, if (inode->i_nlink >= EXT3_LINK_MAX) return -EMLINK; + + vfs_dq_init(dir); + /* * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing * otherwise has the potential to corrupt the orphan inode list. @@ -2278,6 +2293,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, struct ext3_dir_entry_2 * old_de, * new_de; int retval, flush_file = 0; + vfs_dq_init(old_dir); + vfs_dq_init(new_dir); + old_bh = new_bh = dir_bh = NULL; /* Initialize quotas before so that eventual writes go diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 9630583cef28..85fa464a24ad 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -127,7 +127,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) sb->s_dirt = 1; } } - return generic_file_open(inode, filp); + return dquot_file_open(inode, filp); } const struct file_operations ext4_file_operations = { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6a002a6d0624..eaa22ae9f1f6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -170,6 +170,9 @@ void ext4_delete_inode(struct inode *inode) handle_t *handle; int err; + if (!is_bad_inode(inode)) + vfs_dq_init(inode); + if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); @@ -5251,6 +5254,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + if (ia_valid & ATTR_SIZE) + vfs_dq_init(inode); if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { handle_t *handle; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 17a17e10dd60..20f55c2e7571 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1766,6 +1766,8 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, struct inode *inode; int err, retries = 0; + vfs_dq_init(dir); + retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -1800,6 +1802,8 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; + vfs_dq_init(dir); + retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -1837,6 +1841,8 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; + vfs_dq_init(dir); + retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -2136,7 +2142,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ + vfs_dq_init(dir); vfs_dq_init(dentry->d_inode); + handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2195,7 +2203,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ + vfs_dq_init(dir); vfs_dq_init(dentry->d_inode); + handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2250,6 +2260,8 @@ static int ext4_symlink(struct inode *dir, if (l > dir->i_sb->s_blocksize) return -ENAMETOOLONG; + vfs_dq_init(dir); + retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + @@ -2308,6 +2320,8 @@ static int ext4_link(struct dentry *old_dentry, if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; + vfs_dq_init(dir); + /* * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing * otherwise has the potential to corrupt the orphan inode list. @@ -2358,6 +2372,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct ext4_dir_entry_2 *old_de, *new_de; int retval, force_da_alloc = 0; + vfs_dq_init(old_dir); + vfs_dq_init(new_dir); + old_bh = new_bh = dir_bh = NULL; /* Initialize quotas before so that eventual writes go diff --git a/fs/inode.c b/fs/inode.c index f1aef3482b0e..407bf392e20a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -8,7 +8,6 @@ #include <linux/mm.h> #include <linux/dcache.h> #include <linux/init.h> -#include <linux/quotaops.h> #include <linux/slab.h> #include <linux/writeback.h> #include <linux/module.h> @@ -1210,8 +1209,6 @@ void generic_delete_inode(struct inode *inode) if (op->delete_inode) { void (*delete)(struct inode *) = op->delete_inode; - if (!is_bad_inode(inode)) - vfs_dq_init(inode); /* Filesystems implementing their own * s_op->delete_inode are required to call * truncate_inode_pages and clear_inode() diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 2c201783836f..f19bb33eb1eb 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -48,7 +48,7 @@ static int jfs_open(struct inode *inode, struct file *file) { int rc; - if ((rc = generic_file_open(inode, file))) + if ((rc = dquot_file_open(inode, file))) return rc; /* @@ -98,6 +98,8 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) if (rc) return rc; + if (iattr->ia_valid & ATTR_SIZE) + vfs_dq_init(inode); if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { rc = dquot_transfer(inode, iattr); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 22fa412c5289..1aa2dda16590 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -146,6 +146,9 @@ void jfs_delete_inode(struct inode *inode) { jfs_info("In jfs_delete_inode, inode = 0x%p", inode); + if (!is_bad_inode(inode)) + vfs_dq_init(inode); + if (!is_bad_inode(inode) && (JFS_IP(inode)->fileset == FILESYSTEM_I)) { truncate_inode_pages(&inode->i_data, 0); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 1d1390afe55e..b7cc29da50b4 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -85,6 +85,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name); + vfs_dq_init(dip); + /* * search parent directory for entry/freespace * (dtSearch() returns parent directory page pinned) @@ -215,6 +217,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name); + vfs_dq_init(dip); + /* link count overflow on parent directory ? */ if (dip->i_nlink == JFS_LINK_MAX) { rc = -EMLINK; @@ -356,6 +360,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); /* Init inode for quota operations. */ + vfs_dq_init(dip); vfs_dq_init(ip); /* directory must be empty to be removed */ @@ -483,6 +488,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name); /* Init inode for quota operations. */ + vfs_dq_init(dip); vfs_dq_init(ip); if ((rc = get_UCSname(&dname, dentry))) @@ -805,6 +811,8 @@ static int jfs_link(struct dentry *old_dentry, if (ip->i_nlink == 0) return -ENOENT; + vfs_dq_init(dir); + tid = txBegin(ip->i_sb, 0); mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); @@ -896,6 +904,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); + vfs_dq_init(dip); + ssize = strlen(name) + 1; /* @@ -1087,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, new_dentry->d_name.name); + vfs_dq_init(old_dir); + vfs_dq_init(new_dir); + old_ip = old_dentry->d_inode; new_ip = new_dentry->d_inode; @@ -1360,6 +1373,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, jfs_info("jfs_mknod: %s", dentry->d_name.name); + vfs_dq_init(dir); + if ((rc = get_UCSname(&dname, dentry))) goto out; diff --git a/fs/namei.c b/fs/namei.c index a4855af776a8..06abd2bf473c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -19,7 +19,6 @@ #include <linux/slab.h> #include <linux/fs.h> #include <linux/namei.h> -#include <linux/quotaops.h> #include <linux/pagemap.h> #include <linux/fsnotify.h> #include <linux/personality.h> @@ -1461,7 +1460,6 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode, error = security_inode_create(dir, dentry, mode); if (error) return error; - vfs_dq_init(dir); error = dir->i_op->create(dir, dentry, mode, nd); if (!error) fsnotify_create(dir, dentry); @@ -1813,9 +1811,6 @@ ok: } } if (!IS_ERR(filp)) { - if (acc_mode & MAY_WRITE) - vfs_dq_init(nd.path.dentry->d_inode); - if (will_truncate) { error = handle_truncate(&nd.path); if (error) { @@ -1996,7 +1991,6 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) if (error) return error; - vfs_dq_init(dir); error = dir->i_op->mknod(dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); @@ -2095,7 +2089,6 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (error) return error; - vfs_dq_init(dir); error = dir->i_op->mkdir(dir, dentry, mode); if (!error) fsnotify_mkdir(dir, dentry); @@ -2181,8 +2174,6 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) if (!dir->i_op->rmdir) return -EPERM; - vfs_dq_init(dir); - mutex_lock(&dentry->d_inode->i_mutex); dentry_unhash(dentry); if (d_mountpoint(dentry)) @@ -2268,8 +2259,6 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) if (!dir->i_op->unlink) return -EPERM; - vfs_dq_init(dir); - mutex_lock(&dentry->d_inode->i_mutex); if (d_mountpoint(dentry)) error = -EBUSY; @@ -2379,7 +2368,6 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) if (error) return error; - vfs_dq_init(dir); error = dir->i_op->symlink(dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); @@ -2463,7 +2451,6 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de return error; mutex_lock(&inode->i_mutex); - vfs_dq_init(dir); error = dir->i_op->link(old_dentry, dir, new_dentry); mutex_unlock(&inode->i_mutex); if (!error) @@ -2662,9 +2649,6 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!old_dir->i_op->rename) return -EPERM; - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); - old_name = fsnotify_oldname_init(old_dentry->d_name.name); if (is_dir) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8715d194561a..09e9fc043600 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -20,7 +20,6 @@ #include <linux/fcntl.h> #include <linux/namei.h> #include <linux/delay.h> -#include <linux/quotaops.h> #include <linux/fsnotify.h> #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> @@ -377,7 +376,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, put_write_access(inode); goto out_nfserr; } - vfs_dq_init(inode); } /* sanitize the mode change */ @@ -745,8 +743,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, flags = O_RDWR|O_LARGEFILE; else flags = O_WRONLY|O_LARGEFILE; - - vfs_dq_init(inode); } *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt), flags, current_cred()); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 472e8f8bc892..126198f5a67c 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -107,6 +107,9 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); + if (file->f_mode & FMODE_WRITE) + vfs_dq_init(inode); + spin_lock(&oi->ip_lock); /* Check that the inode hasn't been wiped from disk by another @@ -977,6 +980,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; if (size_change) { + vfs_dq_init(inode); + status = ocfs2_rw_lock(inode, 1); if (status < 0) { mlog_errno(status); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 00eb6a095e68..77681a690d16 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -971,6 +971,8 @@ void ocfs2_delete_inode(struct inode *inode) goto bail; } + vfs_dq_init(inode); + if (!ocfs2_inode_is_valid_to_delete(inode)) { /* It's probably not necessary to truncate_inode_pages * here but we do it for safety anyway (it will most diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 99766b6418eb..8b5b142eb638 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -244,6 +244,8 @@ static int ocfs2_mknod(struct inode *dir, (unsigned long)dev, dentry->d_name.len, dentry->d_name.name); + vfs_dq_init(dir); + /* get our super block */ osb = OCFS2_SB(dir->i_sb); @@ -632,6 +634,8 @@ static int ocfs2_link(struct dentry *old_dentry, if (S_ISDIR(inode->i_mode)) return -EPERM; + vfs_dq_init(dir); + err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT); if (err < 0) { if (err != -ENOENT) @@ -787,6 +791,8 @@ static int ocfs2_unlink(struct inode *dir, mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, dentry->d_name.len, dentry->d_name.name); + vfs_dq_init(dir); + BUG_ON(dentry->d_parent->d_inode != dir); mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -1047,6 +1053,9 @@ static int ocfs2_rename(struct inode *old_dir, old_dentry->d_name.len, old_dentry->d_name.name, new_dentry->d_name.len, new_dentry->d_name.name); + vfs_dq_init(old_dir); + vfs_dq_init(new_dir); + osb = OCFS2_SB(old_dir->i_sb); if (new_inode) { @@ -1595,6 +1604,8 @@ static int ocfs2_symlink(struct inode *dir, mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); + vfs_dq_init(dir); + sb = dir->i_sb; osb = OCFS2_SB(sb); diff --git a/fs/open.c b/fs/open.c index 040cef72bc00..b740c4244833 100644 --- a/fs/open.c +++ b/fs/open.c @@ -8,7 +8,6 @@ #include <linux/mm.h> #include <linux/file.h> #include <linux/fdtable.h> -#include <linux/quotaops.h> #include <linux/fsnotify.h> #include <linux/module.h> #include <linux/slab.h> @@ -278,10 +277,8 @@ static long do_sys_truncate(const char __user *pathname, loff_t length) error = locks_verify_truncate(inode, NULL, length); if (!error) error = security_path_truncate(&path, length, 0); - if (!error) { - vfs_dq_init(inode); + if (!error) error = do_truncate(path.dentry, length, 0, NULL); - } put_write_and_out: put_write_access(inode); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index cd83c5b871ba..6244bca45c9d 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1820,6 +1820,20 @@ const struct dquot_operations dquot_operations = { .destroy_dquot = dquot_destroy, }; +/* + * Generic helper for ->open on filesystems supporting disk quotas. + */ +int dquot_file_open(struct inode *inode, struct file *file) +{ + int error; + + error = generic_file_open(inode, file); + if (!error && (file->f_mode & FMODE_WRITE)) + vfs_dq_init(inode); + return error; +} +EXPORT_SYMBOL(dquot_file_open); + /* * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) */ diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index da2dba082e2d..1d9c12714c5c 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -289,7 +289,7 @@ const struct file_operations reiserfs_file_operations = { .compat_ioctl = reiserfs_compat_ioctl, #endif .mmap = reiserfs_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .release = reiserfs_file_release, .fsync = reiserfs_sync_file, .aio_read = generic_file_aio_read, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index f07c3b69247d..06995cb48e39 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -34,6 +34,9 @@ void reiserfs_delete_inode(struct inode *inode) int depth; int err; + if (!is_bad_inode(inode)) + vfs_dq_init(inode); + truncate_inode_pages(&inode->i_data, 0); depth = reiserfs_write_lock_once(inode->i_sb); @@ -3073,6 +3076,8 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) depth = reiserfs_write_lock_once(inode->i_sb); if (attr->ia_valid & ATTR_SIZE) { + vfs_dq_init(inode); + /* version 2 items will be caught by the s_maxbytes check ** done for us in vmtruncate */ diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 9dea84e8a79a..c55e1b9fee5f 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -594,6 +594,8 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, struct reiserfs_transaction_handle th; struct reiserfs_security_handle security; + vfs_dq_init(dir); + if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; } @@ -666,6 +668,8 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, if (!new_valid_dev(rdev)) return -EINVAL; + vfs_dq_init(dir); + if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; } @@ -739,6 +743,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); + vfs_dq_init(dir); + #ifdef DISPLACE_NEW_PACKING_LOCALITIES /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ REISERFS_I(dir)->new_packing_locality = 1; @@ -842,6 +848,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + vfs_dq_init(dir); + reiserfs_write_lock(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); if (retval) @@ -923,6 +931,8 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) unsigned long savelink; int depth; + vfs_dq_init(dir); + inode = dentry->d_inode; /* in this transaction we can be doing at max two balancings and update @@ -1024,6 +1034,8 @@ static int reiserfs_symlink(struct inode *parent_dir, 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); + vfs_dq_init(parent_dir); + if (!(inode = new_inode(parent_dir->i_sb))) { return -ENOMEM; } @@ -1111,6 +1123,8 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, JOURNAL_PER_BALANCE_CNT * 3 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + vfs_dq_init(dir); + reiserfs_write_lock(dir->i_sb); if (inode->i_nlink >= REISERFS_LINK_MAX) { //FIXME: sd_nlink is 32 bit for new files @@ -1235,6 +1249,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, JOURNAL_PER_BALANCE_CNT * 3 + 5 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); + vfs_dq_init(old_dir); + vfs_dq_init(new_dir); + old_inode = old_dentry->d_inode; new_dentry_inode = new_dentry->d_inode; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 81f09fab8ae4..37d034ca7d99 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -61,7 +61,6 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) { BUG_ON(!mutex_is_locked(&dir->i_mutex)); - vfs_dq_init(dir); return dir->i_op->create(dir, dentry, mode, NULL); } #endif @@ -69,7 +68,6 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode) { BUG_ON(!mutex_is_locked(&dir->i_mutex)); - vfs_dq_init(dir); return dir->i_op->mkdir(dir, dentry, mode); } @@ -81,7 +79,6 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) { int error; BUG_ON(!mutex_is_locked(&dir->i_mutex)); - vfs_dq_init(dir); reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, I_MUTEX_CHILD, dir->i_sb); @@ -97,7 +94,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) { int error; BUG_ON(!mutex_is_locked(&dir->i_mutex)); - vfs_dq_init(dir); reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, I_MUTEX_CHILD, dir->i_sb); diff --git a/fs/udf/file.c b/fs/udf/file.c index 2df7fcb677b3..013fa44d9a5e 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -208,7 +208,7 @@ const struct file_operations udf_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, .ioctl = udf_ioctl, - .open = generic_file_open, + .open = dquot_file_open, .mmap = generic_file_mmap, .write = do_sync_write, .aio_write = udf_file_aio_write, @@ -227,6 +227,9 @@ static int udf_setattr(struct dentry *dentry, struct iattr *iattr) if (error) return error; + if (iattr->ia_valid & ATTR_SIZE) + vfs_dq_init(inode); + if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { error = dquot_transfer(inode, iattr); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 1199e8e21ee2..f19520268404 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -36,6 +36,7 @@ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/quotaops.h> #include <linux/slab.h> #include <linux/crc-itu-t.h> @@ -70,6 +71,9 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); void udf_delete_inode(struct inode *inode) { + if (!is_bad_inode(inode)) + vfs_dq_init(inode); + truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) diff --git a/fs/udf/namei.c b/fs/udf/namei.c index cd2115060fdc..e360c3fc4ae4 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -563,6 +563,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, int err; struct udf_inode_info *iinfo; + vfs_dq_init(dir); + lock_kernel(); inode = udf_new_inode(dir, mode, &err); if (!inode) { @@ -616,6 +618,8 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, if (!old_valid_dev(rdev)) return -EINVAL; + vfs_dq_init(dir); + lock_kernel(); err = -EIO; inode = udf_new_inode(dir, mode, &err); @@ -662,6 +666,8 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct udf_inode_info *dinfo = UDF_I(dir); struct udf_inode_info *iinfo; + vfs_dq_init(dir); + lock_kernel(); err = -EMLINK; if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) @@ -799,6 +805,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) struct fileIdentDesc *fi, cfi; struct kernel_lb_addr tloc; + vfs_dq_init(dir); + retval = -ENOENT; lock_kernel(); fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); @@ -845,6 +853,8 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) struct fileIdentDesc cfi; struct kernel_lb_addr tloc; + vfs_dq_init(dir); + retval = -ENOENT; lock_kernel(); fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); @@ -899,6 +909,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, struct buffer_head *bh; struct udf_inode_info *iinfo; + vfs_dq_init(dir); + lock_kernel(); inode = udf_new_inode(dir, S_IFLNK, &err); if (!inode) @@ -1069,6 +1081,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, int err; struct buffer_head *bh; + vfs_dq_init(dir); + lock_kernel(); if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { unlock_kernel(); @@ -1131,6 +1145,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct kernel_lb_addr tloc; struct udf_inode_info *old_iinfo = UDF_I(old_inode); + vfs_dq_init(old_dir); + vfs_dq_init(new_dir); + lock_kernel(); ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); if (ofi) { diff --git a/fs/ufs/file.c b/fs/ufs/file.c index 73655c61240a..d84762f3028e 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -40,7 +40,7 @@ const struct file_operations ufs_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .fsync = simple_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 7cf33379fd46..fff8edab382f 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -36,6 +36,7 @@ #include <linux/mm.h> #include <linux/smp_lock.h> #include <linux/buffer_head.h> +#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -908,6 +909,9 @@ void ufs_delete_inode (struct inode * inode) { loff_t old_i_size; + if (!is_bad_inode(inode)) + vfs_dq_init(inode); + truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) goto no_delete; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 4c26d9e8bc94..c33cb90c516d 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -30,6 +30,7 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/smp_lock.h> +#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -84,6 +85,9 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, int err; UFSD("BEGIN\n"); + + vfs_dq_init(dir); + inode = ufs_new_inode(dir, mode); err = PTR_ERR(inode); @@ -107,6 +111,9 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t if (!old_valid_dev(rdev)) return -EINVAL; + + vfs_dq_init(dir); + inode = ufs_new_inode(dir, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { @@ -131,6 +138,8 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out_notlocked; + vfs_dq_init(dir); + lock_kernel(); inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); @@ -176,6 +185,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, return -EMLINK; } + vfs_dq_init(dir); + inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); atomic_inc(&inode->i_count); @@ -193,6 +204,8 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= UFS_LINK_MAX) goto out; + vfs_dq_init(dir); + lock_kernel(); inode_inc_link_count(dir); @@ -237,6 +250,8 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry) struct page *page; int err = -ENOENT; + vfs_dq_init(dir); + de = ufs_find_entry(dir, &dentry->d_name, &page); if (!de) goto out; @@ -281,6 +296,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ufs_dir_entry *old_de; int err = -ENOENT; + vfs_dq_init(old_dir); + vfs_dq_init(new_dir); + old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 87bbab685901..e5ef8a3ec230 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -527,6 +527,9 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr) if (ia_valid & ATTR_SIZE && attr->ia_size != i_size_read(inode)) { loff_t old_i_size = inode->i_size; + + vfs_dq_init(inode); + error = vmtruncate(inode, attr->ia_size); if (error) return error; diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index a5ebd1abccd8..93ac788345e2 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -48,6 +48,8 @@ int dquot_release(struct dquot *dquot); int dquot_commit_info(struct super_block *sb, int type); int dquot_mark_dquot_dirty(struct dquot *dquot); +int dquot_file_open(struct inode *inode, struct file *file); + int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path, int remount); int vfs_quota_enable(struct inode *inode, int type, int format_id, @@ -342,4 +344,6 @@ static inline void dquot_release_reservation_block(struct inode *inode, __dquot_free_space(inode, nr << inode->i_blkbits, 1); } +#define dquot_file_open generic_file_open + #endif /* _LINUX_QUOTAOPS_ */ -- cgit v1.2.3 From 871a293155a24554e153538d36e3a80fa169aefb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 3 Mar 2010 09:05:07 -0500 Subject: dquot: cleanup dquot initialize routine Get rid of the initialize dquot operation - it is now always called from the filesystem and if a filesystem really needs it's own (which none currently does) it can just call into it's own routine directly. Rename the now static low-level dquot_initialize helper to __dquot_initialize and vfs_dq_init to dquot_initialize to have a consistent namespace. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- Documentation/filesystems/Locking | 2 -- fs/ext2/file.c | 1 + fs/ext2/ialloc.c | 2 +- fs/ext2/inode.c | 4 ++-- fs/ext2/namei.c | 16 ++++++++-------- fs/ext3/file.c | 1 + fs/ext3/ialloc.c | 4 ++-- fs/ext3/inode.c | 6 +++--- fs/ext3/namei.c | 24 ++++++++++++------------ fs/ext3/super.c | 5 ++--- fs/ext4/file.c | 1 + fs/ext4/ialloc.c | 4 ++-- fs/ext4/inode.c | 4 ++-- fs/ext4/namei.c | 24 ++++++++++++------------ fs/ext4/super.c | 5 ++--- fs/jfs/file.c | 2 +- fs/jfs/inode.c | 4 ++-- fs/jfs/jfs_inode.c | 2 +- fs/jfs/namei.c | 24 ++++++++++++------------ fs/ocfs2/file.c | 4 ++-- fs/ocfs2/inode.c | 2 +- fs/ocfs2/namei.c | 14 +++++++------- fs/ocfs2/quota_global.c | 1 - fs/ocfs2/refcounttree.c | 2 +- fs/quota/dquot.c | 32 ++++++++++++++++++++------------ fs/reiserfs/inode.c | 6 +++--- fs/reiserfs/namei.c | 22 +++++++++++----------- fs/reiserfs/super.c | 3 +-- fs/udf/file.c | 2 +- fs/udf/ialloc.c | 2 +- fs/udf/inode.c | 2 +- fs/udf/namei.c | 18 +++++++++--------- fs/ufs/file.c | 1 + fs/ufs/ialloc.c | 2 +- fs/ufs/inode.c | 2 +- fs/ufs/namei.c | 16 ++++++++-------- fs/ufs/truncate.c | 2 +- include/linux/quota.h | 1 - include/linux/quotaops.h | 17 ++++------------- 39 files changed, 141 insertions(+), 145 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index fa10e4bf8e5e..06bbbed71206 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -460,7 +460,6 @@ in sys_read() and friends. --------------------------- dquot_operations ------------------------------- prototypes: - int (*initialize) (struct inode *, int); int (*write_dquot) (struct dquot *); int (*acquire_dquot) (struct dquot *); int (*release_dquot) (struct dquot *); @@ -473,7 +472,6 @@ a proper locking wrt the filesystem and call the generic quota operations. What filesystem should expect from the generic quota functions: FS recursion Held locks when called -initialize: yes maybe dqonoff_sem write_dquot: yes dqonoff_sem or dqptr_sem acquire_dquot: yes dqonoff_sem or dqptr_sem release_dquot: yes dqonoff_sem or dqptr_sem diff --git a/fs/ext2/file.c b/fs/ext2/file.c index d11f6e484519..5d198d0697fb 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -20,6 +20,7 @@ #include <linux/time.h> #include <linux/pagemap.h> +#include <linux/quotaops.h> #include "ext2.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 88b71972c626..ad7d572ee8dc 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -586,7 +586,7 @@ got: goto fail_drop; } - vfs_dq_init(inode); + dquot_initialize(inode); err = dquot_alloc_inode(inode); if (err) goto fail_drop; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c87840c33e17..45ff49f0a4b5 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -59,7 +59,7 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode) void ext2_delete_inode (struct inode * inode) { if (!is_bad_inode(inode)) - vfs_dq_init(inode); + dquot_initialize(inode); truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -1461,7 +1461,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) return error; if (iattr->ia_valid & ATTR_SIZE) - vfs_dq_init(inode); + dquot_initialize(inode); if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { error = dquot_transfer(inode, iattr); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 5923df7b22af..71efb0e9a3f2 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -102,7 +102,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, st { struct inode *inode; - vfs_dq_init(dir); + dquot_initialize(dir); inode = ext2_new_inode(dir, mode); if (IS_ERR(inode)) @@ -131,7 +131,7 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_ if (!new_valid_dev(rdev)) return -EINVAL; - vfs_dq_init(dir); + dquot_initialize(dir); inode = ext2_new_inode (dir, mode); err = PTR_ERR(inode); @@ -157,7 +157,7 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out; - vfs_dq_init(dir); + dquot_initialize(dir); inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); @@ -202,7 +202,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, if (inode->i_nlink >= EXT2_LINK_MAX) return -EMLINK; - vfs_dq_init(dir); + dquot_initialize(dir); inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); @@ -226,7 +226,7 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= EXT2_LINK_MAX) goto out; - vfs_dq_init(dir); + dquot_initialize(dir); inode_inc_link_count(dir); @@ -274,7 +274,7 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry) struct page * page; int err = -ENOENT; - vfs_dq_init(dir); + dquot_initialize(dir); de = ext2_find_entry (dir, &dentry->d_name, &page); if (!de) @@ -318,8 +318,8 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, struct ext2_dir_entry_2 * old_de; int err = -ENOENT; - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); + dquot_initialize(old_dir); + dquot_initialize(new_dir); old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page); if (!old_de) diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 3c7fb11a3b29..f55df0e61cbd 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -21,6 +21,7 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd.h> +#include <linux/quotaops.h> #include <linux/ext3_fs.h> #include <linux/ext3_jbd.h> #include "xattr.h" diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 7d7238f9f6f3..ef9008b885b5 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -123,7 +123,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) * Note: we must free any quota before locking the superblock, * as writing the quota to disk may need the lock as well. */ - vfs_dq_init(inode); + dquot_initialize(inode); ext3_xattr_delete_inode(handle, inode); dquot_free_inode(inode); dquot_drop(inode); @@ -588,7 +588,7 @@ got: sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; ret = inode; - vfs_dq_init(inode); + dquot_initialize(inode); err = dquot_alloc_inode(inode); if (err) goto fail_drop; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index d7962b0c57b3..ffbbc65e3f68 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -197,7 +197,7 @@ void ext3_delete_inode (struct inode * inode) handle_t *handle; if (!is_bad_inode(inode)) - vfs_dq_init(inode); + dquot_initialize(inode); truncate_inode_pages(&inode->i_data, 0); @@ -3152,7 +3152,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) return error; if (ia_valid & ATTR_SIZE) - vfs_dq_init(inode); + dquot_initialize(inode); if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { handle_t *handle; @@ -3250,7 +3250,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode) ret = 2 * (bpp + indirects) + 2; #ifdef CONFIG_QUOTA - /* We know that structure was already allocated during vfs_dq_init so + /* We know that structure was already allocated during dquot_initialize so * we will be updating only the data blocks + inodes */ ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); #endif diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index a492b371b134..ee184084ca42 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1696,7 +1696,7 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, struct inode * inode; int err, retries = 0; - vfs_dq_init(dir); + dquot_initialize(dir); retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -1732,7 +1732,7 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; - vfs_dq_init(dir); + dquot_initialize(dir); retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -1770,7 +1770,7 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= EXT3_LINK_MAX) return -EMLINK; - vfs_dq_init(dir); + dquot_initialize(dir); retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -2066,8 +2066,8 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ - vfs_dq_init(dir); - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) @@ -2127,8 +2127,8 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ - vfs_dq_init(dir); - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) @@ -2184,7 +2184,7 @@ static int ext3_symlink (struct inode * dir, if (l > dir->i_sb->s_blocksize) return -ENAMETOOLONG; - vfs_dq_init(dir); + dquot_initialize(dir); retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -2241,7 +2241,7 @@ static int ext3_link (struct dentry * old_dentry, if (inode->i_nlink >= EXT3_LINK_MAX) return -EMLINK; - vfs_dq_init(dir); + dquot_initialize(dir); /* * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing @@ -2293,15 +2293,15 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, struct ext3_dir_entry_2 * old_de, * new_de; int retval, flush_file = 0; - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); + dquot_initialize(old_dir); + dquot_initialize(new_dir); old_bh = new_bh = dir_bh = NULL; /* Initialize quotas before so that eventual writes go * in separate transaction */ if (new_dentry->d_inode) - vfs_dq_init(new_dentry->d_inode); + dquot_initialize(new_dentry->d_inode); handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 0163d0dae124..e844accbf55d 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -752,7 +752,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static const struct dquot_operations ext3_quota_operations = { - .initialize = dquot_initialize, .write_dquot = ext3_write_dquot, .acquire_dquot = ext3_acquire_dquot, .release_dquot = ext3_release_dquot, @@ -1480,7 +1479,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, } list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); - vfs_dq_init(inode); + dquot_initialize(inode); if (inode->i_nlink) { printk(KERN_DEBUG "%s: truncating inode %lu to %Ld bytes\n", @@ -2736,7 +2735,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) * Process 1 Process 2 * ext3_create() quota_sync() * journal_start() write_dquot() - * vfs_dq_init() down(dqio_mutex) + * dquot_initialize() down(dqio_mutex) * down(dqio_mutex) journal_start() * */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 85fa464a24ad..a08a12998c49 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -23,6 +23,7 @@ #include <linux/jbd2.h> #include <linux/mount.h> #include <linux/path.h> +#include <linux/quotaops.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index ca8986e4b528..9bb2bb9f67ad 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -217,7 +217,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) * Note: we must free any quota before locking the superblock, * as writing the quota to disk may need the lock as well. */ - vfs_dq_init(inode); + dquot_initialize(inode); ext4_xattr_delete_inode(handle, inode); dquot_free_inode(inode); dquot_drop(inode); @@ -1034,7 +1034,7 @@ got: ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; ret = inode; - vfs_dq_init(inode); + dquot_initialize(inode); err = dquot_alloc_inode(inode); if (err) goto fail_drop; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index eaa22ae9f1f6..bec222ca9ba4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -171,7 +171,7 @@ void ext4_delete_inode(struct inode *inode) int err; if (!is_bad_inode(inode)) - vfs_dq_init(inode); + dquot_initialize(inode); if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); @@ -5255,7 +5255,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) return error; if (ia_valid & ATTR_SIZE) - vfs_dq_init(inode); + dquot_initialize(inode); if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { handle_t *handle; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 20f55c2e7571..7f3d2d75a0dc 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1766,7 +1766,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, struct inode *inode; int err, retries = 0; - vfs_dq_init(dir); + dquot_initialize(dir); retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -1802,7 +1802,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; - vfs_dq_init(dir); + dquot_initialize(dir); retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -1841,7 +1841,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; - vfs_dq_init(dir); + dquot_initialize(dir); retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -2142,8 +2142,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ - vfs_dq_init(dir); - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) @@ -2203,8 +2203,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ - vfs_dq_init(dir); - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) @@ -2260,7 +2260,7 @@ static int ext4_symlink(struct inode *dir, if (l > dir->i_sb->s_blocksize) return -ENAMETOOLONG; - vfs_dq_init(dir); + dquot_initialize(dir); retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -2320,7 +2320,7 @@ static int ext4_link(struct dentry *old_dentry, if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; - vfs_dq_init(dir); + dquot_initialize(dir); /* * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing @@ -2372,15 +2372,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct ext4_dir_entry_2 *old_de, *new_de; int retval, force_da_alloc = 0; - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); + dquot_initialize(old_dir); + dquot_initialize(new_dir); old_bh = new_bh = dir_bh = NULL; /* Initialize quotas before so that eventual writes go * in separate transaction */ if (new_dentry->d_inode) - vfs_dq_init(new_dentry->d_inode); + dquot_initialize(new_dentry->d_inode); handle = ext4_journal_start(old_dir, 2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 035516c80df2..edcf3b0239d1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1013,7 +1013,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static const struct dquot_operations ext4_quota_operations = { - .initialize = dquot_initialize, #ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, #endif @@ -1931,7 +1930,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, } list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); - vfs_dq_init(inode); + dquot_initialize(inode); if (inode->i_nlink) { ext4_msg(sb, KERN_DEBUG, "%s: truncating inode %lu to %lld bytes", @@ -3700,7 +3699,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) * Process 1 Process 2 * ext4_create() quota_sync() * jbd2_journal_start() write_dquot() - * vfs_dq_init() down(dqio_mutex) + * dquot_initialize() down(dqio_mutex) * down(dqio_mutex) jbd2_journal_start() * */ diff --git a/fs/jfs/file.c b/fs/jfs/file.c index f19bb33eb1eb..14ba982b3f24 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -99,7 +99,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) return rc; if (iattr->ia_valid & ATTR_SIZE) - vfs_dq_init(inode); + dquot_initialize(inode); if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { rc = dquot_transfer(inode, iattr); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 1aa2dda16590..c694a5f15380 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -147,7 +147,7 @@ void jfs_delete_inode(struct inode *inode) jfs_info("In jfs_delete_inode, inode = 0x%p", inode); if (!is_bad_inode(inode)) - vfs_dq_init(inode); + dquot_initialize(inode); if (!is_bad_inode(inode) && (JFS_IP(inode)->fileset == FILESYSTEM_I)) { @@ -161,7 +161,7 @@ void jfs_delete_inode(struct inode *inode) /* * Free the inode from the quota allocation. */ - vfs_dq_init(inode); + dquot_initialize(inode); dquot_free_inode(inode); dquot_drop(inode); } diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 72b30895422c..829921b67765 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -116,7 +116,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) /* * Allocate inode to quota. */ - vfs_dq_init(inode); + dquot_initialize(inode); rc = dquot_alloc_inode(inode); if (rc) goto fail_drop; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index b7cc29da50b4..4a3e9f39c21d 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -85,7 +85,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name); - vfs_dq_init(dip); + dquot_initialize(dip); /* * search parent directory for entry/freespace @@ -217,7 +217,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name); - vfs_dq_init(dip); + dquot_initialize(dip); /* link count overflow on parent directory ? */ if (dip->i_nlink == JFS_LINK_MAX) { @@ -360,8 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); /* Init inode for quota operations. */ - vfs_dq_init(dip); - vfs_dq_init(ip); + dquot_initialize(dip); + dquot_initialize(ip); /* directory must be empty to be removed */ if (!dtEmpty(ip)) { @@ -488,8 +488,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name); /* Init inode for quota operations. */ - vfs_dq_init(dip); - vfs_dq_init(ip); + dquot_initialize(dip); + dquot_initialize(ip); if ((rc = get_UCSname(&dname, dentry))) goto out; @@ -811,7 +811,7 @@ static int jfs_link(struct dentry *old_dentry, if (ip->i_nlink == 0) return -ENOENT; - vfs_dq_init(dir); + dquot_initialize(dir); tid = txBegin(ip->i_sb, 0); @@ -904,7 +904,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); - vfs_dq_init(dip); + dquot_initialize(dip); ssize = strlen(name) + 1; @@ -1097,8 +1097,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, new_dentry->d_name.name); - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); + dquot_initialize(old_dir); + dquot_initialize(new_dir); old_ip = old_dentry->d_inode; new_ip = new_dentry->d_inode; @@ -1149,7 +1149,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, } else if (new_ip) { IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); /* Init inode for quota operations. */ - vfs_dq_init(new_ip); + dquot_initialize(new_ip); } /* @@ -1373,7 +1373,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, jfs_info("jfs_mknod: %s", dentry->d_name.name); - vfs_dq_init(dir); + dquot_initialize(dir); if ((rc = get_UCSname(&dname, dentry))) goto out; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 126198f5a67c..364105291282 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -108,7 +108,7 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); if (file->f_mode & FMODE_WRITE) - vfs_dq_init(inode); + dquot_initialize(inode); spin_lock(&oi->ip_lock); @@ -980,7 +980,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; if (size_change) { - vfs_dq_init(inode); + dquot_initialize(inode); status = ocfs2_rw_lock(inode, 1); if (status < 0) { diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 77681a690d16..278a223aae14 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -971,7 +971,7 @@ void ocfs2_delete_inode(struct inode *inode) goto bail; } - vfs_dq_init(inode); + dquot_initialize(inode); if (!ocfs2_inode_is_valid_to_delete(inode)) { /* It's probably not necessary to truncate_inode_pages diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8b5b142eb638..d9cd4e373a53 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -212,7 +212,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) } else inode->i_gid = current_fsgid(); inode->i_mode = mode; - vfs_dq_init(inode); + dquot_initialize(inode); return inode; } @@ -244,7 +244,7 @@ static int ocfs2_mknod(struct inode *dir, (unsigned long)dev, dentry->d_name.len, dentry->d_name.name); - vfs_dq_init(dir); + dquot_initialize(dir); /* get our super block */ osb = OCFS2_SB(dir->i_sb); @@ -634,7 +634,7 @@ static int ocfs2_link(struct dentry *old_dentry, if (S_ISDIR(inode->i_mode)) return -EPERM; - vfs_dq_init(dir); + dquot_initialize(dir); err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT); if (err < 0) { @@ -791,7 +791,7 @@ static int ocfs2_unlink(struct inode *dir, mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, dentry->d_name.len, dentry->d_name.name); - vfs_dq_init(dir); + dquot_initialize(dir); BUG_ON(dentry->d_parent->d_inode != dir); @@ -1053,8 +1053,8 @@ static int ocfs2_rename(struct inode *old_dir, old_dentry->d_name.len, old_dentry->d_name.name, new_dentry->d_name.len, new_dentry->d_name.name); - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); + dquot_initialize(old_dir); + dquot_initialize(new_dir); osb = OCFS2_SB(old_dir->i_sb); @@ -1604,7 +1604,7 @@ static int ocfs2_symlink(struct inode *dir, mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); - vfs_dq_init(dir); + dquot_initialize(dir); sb = dir->i_sb; osb = OCFS2_SB(sb); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 4dca38f487cf..355f41d1d520 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -851,7 +851,6 @@ static void ocfs2_destroy_dquot(struct dquot *dquot) } const struct dquot_operations ocfs2_quota_operations = { - .initialize = dquot_initialize, .write_dquot = ocfs2_write_dquot, .acquire_dquot = ocfs2_acquire_dquot, .release_dquot = ocfs2_release_dquot, diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 8ae65c9c020c..f3ae10cde841 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4390,7 +4390,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, } mutex_lock(&inode->i_mutex); - vfs_dq_init(dir); + dquot_initialize(dir); error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); mutex_unlock(&inode->i_mutex); if (!error) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 6244bca45c9d..3c0a7e0dff78 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -230,6 +230,7 @@ struct dqstats dqstats; EXPORT_SYMBOL(dqstats); static qsize_t inode_get_rsv_space(struct inode *inode); +static void __dquot_initialize(struct inode *inode, int type); static inline unsigned int hashfn(const struct super_block *sb, unsigned int id, int type) @@ -890,7 +891,7 @@ static void add_dquot_ref(struct super_block *sb, int type) spin_unlock(&inode_lock); iput(old_inode); - sb->dq_op->initialize(inode, type); + __dquot_initialize(inode, type); /* We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the inode_lock. * We cannot iput the inode now as we can be holding the last @@ -1293,22 +1294,26 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space) } /* - * Initialize quota pointers in inode - * We do things in a bit complicated way but by that we avoid calling - * dqget() and thus filesystem callbacks under dqptr_sem. + * Initialize quota pointers in inode + * + * We do things in a bit complicated way but by that we avoid calling + * dqget() and thus filesystem callbacks under dqptr_sem. + * + * It is better to call this function outside of any transaction as it + * might need a lot of space in journal for dquot structure allocation. */ -int dquot_initialize(struct inode *inode, int type) +static void __dquot_initialize(struct inode *inode, int type) { unsigned int id = 0; - int cnt, ret = 0; + int cnt; struct dquot *got[MAXQUOTAS]; struct super_block *sb = inode->i_sb; qsize_t rsv; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) - return 0; + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + return; /* First get references to structures we might need. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1351,7 +1356,11 @@ out_err: up_write(&sb_dqopt(sb)->dqptr_sem); /* Drop unused references */ dqput_all(got); - return ret; +} + +void dquot_initialize(struct inode *inode) +{ + __dquot_initialize(inode, -1); } EXPORT_SYMBOL(dquot_initialize); @@ -1783,7 +1792,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) chid[GRPQUOTA] = iattr->ia_gid; } if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { - vfs_dq_init(inode); + dquot_initialize(inode); if (__dquot_transfer(inode, chid, mask) == NO_QUOTA) return -EDQUOT; } @@ -1810,7 +1819,6 @@ EXPORT_SYMBOL(dquot_commit_info); * Definitions of diskquota operations. */ const struct dquot_operations dquot_operations = { - .initialize = dquot_initialize, .write_dquot = dquot_commit, .acquire_dquot = dquot_acquire, .release_dquot = dquot_release, @@ -1829,7 +1837,7 @@ int dquot_file_open(struct inode *inode, struct file *file) error = generic_file_open(inode, file); if (!error && (file->f_mode & FMODE_WRITE)) - vfs_dq_init(inode); + dquot_initialize(inode); return error; } EXPORT_SYMBOL(dquot_file_open); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 06995cb48e39..b8671a54e8ed 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -35,7 +35,7 @@ void reiserfs_delete_inode(struct inode *inode) int err; if (!is_bad_inode(inode)) - vfs_dq_init(inode); + dquot_initialize(inode); truncate_inode_pages(&inode->i_data, 0); @@ -1768,7 +1768,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); - vfs_dq_init(inode); + dquot_initialize(inode); err = dquot_alloc_inode(inode); if (err) goto out_end_trans; @@ -3076,7 +3076,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) depth = reiserfs_write_lock_once(inode->i_sb); if (attr->ia_valid & ATTR_SIZE) { - vfs_dq_init(inode); + dquot_initialize(inode); /* version 2 items will be caught by the s_maxbytes check ** done for us in vmtruncate diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index c55e1b9fee5f..96e4cbbfaa18 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -554,7 +554,7 @@ static int drop_new_inode(struct inode *inode) } /* utility function that does setup for reiserfs_new_inode. -** vfs_dq_init needs lots of credits so it's better to have it +** dquot_initialize needs lots of credits so it's better to have it ** outside of a transaction, so we had to pull some bits of ** reiserfs_new_inode out into this func. */ @@ -577,7 +577,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode) } else { inode->i_gid = current_fsgid(); } - vfs_dq_init(inode); + dquot_initialize(inode); return 0; } @@ -594,7 +594,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, struct reiserfs_transaction_handle th; struct reiserfs_security_handle security; - vfs_dq_init(dir); + dquot_initialize(dir); if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; @@ -668,7 +668,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, if (!new_valid_dev(rdev)) return -EINVAL; - vfs_dq_init(dir); + dquot_initialize(dir); if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; @@ -743,7 +743,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); - vfs_dq_init(dir); + dquot_initialize(dir); #ifdef DISPLACE_NEW_PACKING_LOCALITIES /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ @@ -848,7 +848,7 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); - vfs_dq_init(dir); + dquot_initialize(dir); reiserfs_write_lock(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); @@ -931,7 +931,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) unsigned long savelink; int depth; - vfs_dq_init(dir); + dquot_initialize(dir); inode = dentry->d_inode; @@ -1034,7 +1034,7 @@ static int reiserfs_symlink(struct inode *parent_dir, 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); - vfs_dq_init(parent_dir); + dquot_initialize(parent_dir); if (!(inode = new_inode(parent_dir->i_sb))) { return -ENOMEM; @@ -1123,7 +1123,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, JOURNAL_PER_BALANCE_CNT * 3 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); - vfs_dq_init(dir); + dquot_initialize(dir); reiserfs_write_lock(dir->i_sb); if (inode->i_nlink >= REISERFS_LINK_MAX) { @@ -1249,8 +1249,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, JOURNAL_PER_BALANCE_CNT * 3 + 5 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); + dquot_initialize(old_dir); + dquot_initialize(new_dir); old_inode = old_dentry->d_inode; new_dentry_inode = new_dentry->d_inode; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 34f7cd0cb02d..04bf5d791bda 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -246,7 +246,7 @@ static int finish_unfinished(struct super_block *s) retval = remove_save_link_only(s, &save_link_key, 0); continue; } - vfs_dq_init(inode); + dquot_initialize(inode); if (truncate && S_ISDIR(inode->i_mode)) { /* We got a truncate request for a dir which is impossible. @@ -622,7 +622,6 @@ static int reiserfs_write_info(struct super_block *, int); static int reiserfs_quota_on(struct super_block *, int, int, char *, int); static const struct dquot_operations reiserfs_quota_operations = { - .initialize = dquot_initialize, .write_dquot = reiserfs_write_dquot, .acquire_dquot = reiserfs_acquire_dquot, .release_dquot = reiserfs_release_dquot, diff --git a/fs/udf/file.c b/fs/udf/file.c index 013fa44d9a5e..1eb06774ed90 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -228,7 +228,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *iattr) return error; if (iattr->ia_valid & ATTR_SIZE) - vfs_dq_init(inode); + dquot_initialize(inode); if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 15c6e992e587..fb68c9cd0c3e 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -153,7 +153,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) insert_inode_hash(inode); mark_inode_dirty(inode); - vfs_dq_init(inode); + dquot_initialize(inode); ret = dquot_alloc_inode(inode); if (ret) { dquot_drop(inode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index f19520268404..c7da1a32b364 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -72,7 +72,7 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); void udf_delete_inode(struct inode *inode) { if (!is_bad_inode(inode)) - vfs_dq_init(inode); + dquot_initialize(inode); truncate_inode_pages(&inode->i_data, 0); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index e360c3fc4ae4..96757e3e3e04 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -563,7 +563,7 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, int err; struct udf_inode_info *iinfo; - vfs_dq_init(dir); + dquot_initialize(dir); lock_kernel(); inode = udf_new_inode(dir, mode, &err); @@ -618,7 +618,7 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, if (!old_valid_dev(rdev)) return -EINVAL; - vfs_dq_init(dir); + dquot_initialize(dir); lock_kernel(); err = -EIO; @@ -666,7 +666,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct udf_inode_info *dinfo = UDF_I(dir); struct udf_inode_info *iinfo; - vfs_dq_init(dir); + dquot_initialize(dir); lock_kernel(); err = -EMLINK; @@ -805,7 +805,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) struct fileIdentDesc *fi, cfi; struct kernel_lb_addr tloc; - vfs_dq_init(dir); + dquot_initialize(dir); retval = -ENOENT; lock_kernel(); @@ -853,7 +853,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) struct fileIdentDesc cfi; struct kernel_lb_addr tloc; - vfs_dq_init(dir); + dquot_initialize(dir); retval = -ENOENT; lock_kernel(); @@ -909,7 +909,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, struct buffer_head *bh; struct udf_inode_info *iinfo; - vfs_dq_init(dir); + dquot_initialize(dir); lock_kernel(); inode = udf_new_inode(dir, S_IFLNK, &err); @@ -1081,7 +1081,7 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, int err; struct buffer_head *bh; - vfs_dq_init(dir); + dquot_initialize(dir); lock_kernel(); if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { @@ -1145,8 +1145,8 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct kernel_lb_addr tloc; struct udf_inode_info *old_iinfo = UDF_I(old_inode); - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); + dquot_initialize(old_dir); + dquot_initialize(new_dir); lock_kernel(); ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); diff --git a/fs/ufs/file.c b/fs/ufs/file.c index d84762f3028e..a8962cecde5b 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -24,6 +24,7 @@ */ #include <linux/fs.h> +#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 67b4bdb056fb..230ecf608026 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -355,7 +355,7 @@ cg_found: unlock_super (sb); - vfs_dq_init(inode); + dquot_initialize(inode); err = dquot_alloc_inode(inode); if (err) { dquot_drop(inode); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index fff8edab382f..09aef49beedb 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -910,7 +910,7 @@ void ufs_delete_inode (struct inode * inode) loff_t old_i_size; if (!is_bad_inode(inode)) - vfs_dq_init(inode); + dquot_initialize(inode); truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index c33cb90c516d..118556243e7a 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -86,7 +86,7 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, UFSD("BEGIN\n"); - vfs_dq_init(dir); + dquot_initialize(dir); inode = ufs_new_inode(dir, mode); err = PTR_ERR(inode); @@ -112,7 +112,7 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t if (!old_valid_dev(rdev)) return -EINVAL; - vfs_dq_init(dir); + dquot_initialize(dir); inode = ufs_new_inode(dir, mode); err = PTR_ERR(inode); @@ -138,7 +138,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out_notlocked; - vfs_dq_init(dir); + dquot_initialize(dir); lock_kernel(); inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); @@ -185,7 +185,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, return -EMLINK; } - vfs_dq_init(dir); + dquot_initialize(dir); inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); @@ -204,7 +204,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= UFS_LINK_MAX) goto out; - vfs_dq_init(dir); + dquot_initialize(dir); lock_kernel(); inode_inc_link_count(dir); @@ -250,7 +250,7 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry) struct page *page; int err = -ENOENT; - vfs_dq_init(dir); + dquot_initialize(dir); de = ufs_find_entry(dir, &dentry->d_name, &page); if (!de) @@ -296,8 +296,8 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ufs_dir_entry *old_de; int err = -ENOENT; - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); + dquot_initialize(old_dir); + dquot_initialize(new_dir); old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_de) diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index e5ef8a3ec230..d3b6270cb377 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -528,7 +528,7 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_size != i_size_read(inode)) { loff_t old_i_size = inode->i_size; - vfs_dq_init(inode); + dquot_initialize(inode); error = vmtruncate(inode, attr->ia_size); if (error) diff --git a/include/linux/quota.h b/include/linux/quota.h index aec2e9dac2d7..4aa93554f0eb 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -295,7 +295,6 @@ struct quota_format_ops { /* Operations working with dquots */ struct dquot_operations { - int (*initialize) (struct inode *, int); int (*write_dquot) (struct dquot *); /* Ordinary dquot write */ struct dquot *(*alloc_dquot)(struct super_block *, int); /* Allocate memory for new dquot */ void (*destroy_dquot)(struct dquot *); /* Free memory for dquot */ diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 93ac788345e2..e6fa7acce290 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -23,7 +23,7 @@ void inode_add_rsv_space(struct inode *inode, qsize_t number); void inode_claim_rsv_space(struct inode *inode, qsize_t number); void inode_sub_rsv_space(struct inode *inode, qsize_t number); -int dquot_initialize(struct inode *inode, int type); +void dquot_initialize(struct inode *inode); void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, unsigned int id, int type); void dqput(struct dquot *dquot); @@ -139,15 +139,6 @@ extern const struct quotactl_ops vfs_quotactl_ops; #define sb_dquot_ops (&dquot_operations) #define sb_quotactl_ops (&vfs_quotactl_ops) -/* It is better to call this function outside of any transaction as it might - * need a lot of space in journal for dquot structure allocation. */ -static inline void vfs_dq_init(struct inode *inode) -{ - BUG_ON(!inode->i_sb); - if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) - inode->i_sb->dq_op->initialize(inode, -1); -} - /* Cannot be called inside a transaction */ static inline int vfs_dq_off(struct super_block *sb, int remount) { @@ -207,7 +198,7 @@ static inline int sb_any_quota_active(struct super_block *sb) #define sb_dquot_ops (NULL) #define sb_quotactl_ops (NULL) -static inline void vfs_dq_init(struct inode *inode) +static inline void dquot_initialize(struct inode *inode) { } @@ -260,6 +251,8 @@ static inline int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) return 0; } +#define dquot_file_open generic_file_open + #endif /* CONFIG_QUOTA */ static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr) @@ -344,6 +337,4 @@ static inline void dquot_release_reservation_block(struct inode *inode, __dquot_free_space(inode, nr << inode->i_blkbits, 1); } -#define dquot_file_open generic_file_open - #endif /* _LINUX_QUOTAOPS_ */ -- cgit v1.2.3 From efd8f0e6f6c1faa041f228d7113bd3a9db802d49 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@infradead.org> Date: Wed, 3 Mar 2010 09:05:08 -0500 Subject: quota: stop using QUOTA_OK / NO_QUOTA Just use 0 / -EDQUOT directly - that's what it translates to anyway. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/quota/dquot.c | 48 ++++++++++++++++++++++++------------------------ include/linux/quota.h | 3 --- 2 files changed, 24 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 3c0a7e0dff78..e0b870f4749f 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1181,13 +1181,13 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) *warntype = QUOTA_NL_NOWARN; if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) - return QUOTA_OK; + return 0; if (dquot->dq_dqb.dqb_ihardlimit && newinodes > dquot->dq_dqb.dqb_ihardlimit && !ignore_hardlimit(dquot)) { *warntype = QUOTA_NL_IHARDWARN; - return NO_QUOTA; + return -EDQUOT; } if (dquot->dq_dqb.dqb_isoftlimit && @@ -1196,7 +1196,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) get_seconds() >= dquot->dq_dqb.dqb_itime && !ignore_hardlimit(dquot)) { *warntype = QUOTA_NL_ISOFTLONGWARN; - return NO_QUOTA; + return -EDQUOT; } if (dquot->dq_dqb.dqb_isoftlimit && @@ -1207,7 +1207,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; } - return QUOTA_OK; + return 0; } /* needs dq_data_lock */ @@ -1219,7 +1219,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war *warntype = QUOTA_NL_NOWARN; if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) - return QUOTA_OK; + return 0; tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace + space; @@ -1229,7 +1229,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war !ignore_hardlimit(dquot)) { if (!prealloc) *warntype = QUOTA_NL_BHARDWARN; - return NO_QUOTA; + return -EDQUOT; } if (dquot->dq_dqb.dqb_bsoftlimit && @@ -1239,7 +1239,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war !ignore_hardlimit(dquot)) { if (!prealloc) *warntype = QUOTA_NL_BSOFTLONGWARN; - return NO_QUOTA; + return -EDQUOT; } if (dquot->dq_dqb.dqb_bsoftlimit && @@ -1255,10 +1255,10 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war * We don't allow preallocation to exceed softlimit so exceeding will * be always printed */ - return NO_QUOTA; + return -EDQUOT; } - return QUOTA_OK; + return 0; } static int info_idq_free(struct dquot *dquot, qsize_t inodes) @@ -1507,9 +1507,9 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - if (check_bdq(inode->i_dquot[cnt], number, !warn, warntype+cnt) - == NO_QUOTA) { - ret = -EDQUOT; + ret = check_bdq(inode->i_dquot[cnt], number, !warn, + warntype+cnt); + if (ret) { spin_unlock(&dq_data_lock); goto out_flush_warn; } @@ -1541,7 +1541,7 @@ EXPORT_SYMBOL(__dquot_alloc_space); */ int dquot_alloc_inode(const struct inode *inode) { - int cnt, ret = -EDQUOT; + int cnt, ret = 0; char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we @@ -1555,8 +1555,8 @@ int dquot_alloc_inode(const struct inode *inode) for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - if (check_idq(inode->i_dquot[cnt], 1, warntype+cnt) - == NO_QUOTA) + ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt); + if (ret) goto warn_put_all; } @@ -1565,7 +1565,7 @@ int dquot_alloc_inode(const struct inode *inode) continue; dquot_incr_inodes(inode->i_dquot[cnt], 1); } - ret = 0; + warn_put_all: spin_unlock(&dq_data_lock); if (ret == 0) @@ -1683,14 +1683,14 @@ static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask qsize_t rsv_space = 0; struct dquot *transfer_from[MAXQUOTAS]; struct dquot *transfer_to[MAXQUOTAS]; - int cnt, ret = QUOTA_OK; + int cnt, ret = 0; char warntype_to[MAXQUOTAS]; char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) - return QUOTA_OK; + return 0; /* Initialize the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { transfer_from[cnt] = NULL; @@ -1715,9 +1715,11 @@ static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask if (!transfer_to[cnt]) continue; transfer_from[cnt] = inode->i_dquot[cnt]; - if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) == - NO_QUOTA || check_bdq(transfer_to[cnt], space, 0, - warntype_to + cnt) == NO_QUOTA) + ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt); + if (ret) + goto over_quota; + ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt); + if (ret) goto over_quota; } @@ -1771,7 +1773,6 @@ over_quota: /* Clear dquot pointers we don't want to dqput() */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) transfer_from[cnt] = NULL; - ret = NO_QUOTA; goto warn_put_all; } @@ -1793,8 +1794,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) } if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { dquot_initialize(inode); - if (__dquot_transfer(inode, chid, mask) == NO_QUOTA) - return -EDQUOT; + return __dquot_transfer(inode, chid, mask); } return 0; } diff --git a/include/linux/quota.h b/include/linux/quota.h index 4aa93554f0eb..b462916b2a0a 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -279,9 +279,6 @@ struct dquot { struct mem_dqblk dq_dqb; /* Diskquota usage */ }; -#define QUOTA_OK 0 -#define NO_QUOTA 1 - /* Operations which must be implemented by each quota format */ struct quota_format_ops { int (*check_quota_file)(struct super_block *sb, int type); /* Detect whether file is in our format */ -- cgit v1.2.3 From 64e290ec69be39f1887fa0b403c1e417b6b038e7 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu, 4 Mar 2010 22:25:21 -0500 Subject: ext4: fix up rb_root initializations to use RB_ROOT ext4 uses rb_node = NULL; to zero rb_root at few places. Using RB_ROOT as the initializer is more portable in case the underlying implementation of rbtrees changes in the future. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Cc: Eric Paris <eparis@redhat.com> --- fs/ext4/block_validity.c | 4 ++-- fs/ext4/dir.c | 2 +- fs/ext4/mballoc.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index a60ab9aad57d..983f0e127493 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -205,14 +205,14 @@ void ext4_release_system_zone(struct super_block *sb) entry = rb_entry(n, struct ext4_system_zone, node); kmem_cache_free(ext4_system_zone_cachep, entry); if (!parent) - EXT4_SB(sb)->system_blks.rb_node = NULL; + EXT4_SB(sb)->system_blks = RB_ROOT; else if (parent->rb_left == n) parent->rb_left = NULL; else if (parent->rb_right == n) parent->rb_right = NULL; n = parent; } - EXT4_SB(sb)->system_blks.rb_node = NULL; + EXT4_SB(sb)->system_blks = RB_ROOT; } /* diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 29857ddd9e26..86cb6d86a048 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -305,7 +305,7 @@ static void free_rb_tree_fname(struct rb_root *root) kfree(old); } if (!parent) - root->rb_node = NULL; + *root = RB_ROOT; else if (parent->rb_left == n) parent->rb_left = NULL; else if (parent->rb_right == n) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 37d2438e0cb4..abb11e328b65 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2253,7 +2253,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); - meta_group_info[i]->bb_free_root.rb_node = NULL; + meta_group_info[i]->bb_free_root = RB_ROOT; #ifdef DOUBLE_CHECK { -- cgit v1.2.3 From 648fa8611de3d4d43bbd64af3226679d2d0eb609 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Thu, 24 Dec 2009 01:26:48 -0500 Subject: beginning to untangle do_filp_open() That's going to be a long and painful series. The first step: take the stuff reachable from 'ok' label in do_filp_open() into a new helper (finish_open()). Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 106 ++++++++++++++++++++++++++++++++----------------------------- 1 file changed, 56 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 0741c69b3319..60b74b3946a1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1590,6 +1590,61 @@ static int open_will_truncate(int flag, struct inode *inode) return (flag & O_TRUNC); } +static struct file *finish_open(struct nameidata *nd, + int open_flag, int flag, int acc_mode) +{ + struct file *filp; + int will_truncate; + int error; + + will_truncate = open_will_truncate(flag, nd->path.dentry->d_inode); + if (will_truncate) { + error = mnt_want_write(nd->path.mnt); + if (error) + goto exit; + } + error = may_open(&nd->path, acc_mode, open_flag); + if (error) { + if (will_truncate) + mnt_drop_write(nd->path.mnt); + goto exit; + } + filp = nameidata_to_filp(nd); + if (!IS_ERR(filp)) { + error = ima_file_check(filp, acc_mode); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + if (!IS_ERR(filp)) { + if (acc_mode & MAY_WRITE) + vfs_dq_init(nd->path.dentry->d_inode); + + if (will_truncate) { + error = handle_truncate(&nd->path); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + } + /* + * It is now safe to drop the mnt write + * because the filp has had a write taken + * on its behalf. + */ + if (will_truncate) + mnt_drop_write(nd->path.mnt); + return filp; + +exit: + if (!IS_ERR(nd->intent.open.file)) + release_open_intent(nd); + path_put(&nd->path); + return ERR_PTR(error); +} + /* * Note that the low bits of the passed in "open_flag" * are not the same as in the local variable "flag". See @@ -1604,7 +1659,6 @@ struct file *do_filp_open(int dfd, const char *pathname, struct path path; struct dentry *dir; int count = 0; - int will_truncate; int flag = open_to_namei_flags(open_flag); int force_reval = 0; @@ -1769,55 +1823,7 @@ do_last: if (S_ISDIR(path.dentry->d_inode->i_mode)) goto exit; ok: - /* - * Consider: - * 1. may_open() truncates a file - * 2. a rw->ro mount transition occurs - * 3. nameidata_to_filp() fails due to - * the ro mount. - * That would be inconsistent, and should - * be avoided. Taking this mnt write here - * ensures that (2) can not occur. - */ - will_truncate = open_will_truncate(flag, nd.path.dentry->d_inode); - if (will_truncate) { - error = mnt_want_write(nd.path.mnt); - if (error) - goto exit; - } - error = may_open(&nd.path, acc_mode, open_flag); - if (error) { - if (will_truncate) - mnt_drop_write(nd.path.mnt); - goto exit; - } - filp = nameidata_to_filp(&nd); - if (!IS_ERR(filp)) { - error = ima_file_check(filp, acc_mode); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - if (!IS_ERR(filp)) { - if (acc_mode & MAY_WRITE) - vfs_dq_init(nd.path.dentry->d_inode); - - if (will_truncate) { - error = handle_truncate(&nd.path); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - } - /* - * It is now safe to drop the mnt write - * because the filp has had a write taken - * on its behalf. - */ - if (will_truncate) - mnt_drop_write(nd.path.mnt); + filp = finish_open(&nd, open_flag, flag, acc_mode); if (nd.root.mnt) path_put(&nd.root); return filp; -- cgit v1.2.3 From fb1cc555d533869910e20de4b8d5147570afdfad Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Thu, 24 Dec 2009 01:58:28 -0500 Subject: gut do_filp_open() a bit more (do_last separation) Brute-force separation of stuff reachable from do_last: with the exception of do_link:; just take all that crap to a helper function as-is and have it tell the caller if it has to go to do_link. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 171 +++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 103 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 60b74b3946a1..3c39fa1608c5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1645,6 +1645,104 @@ exit: return ERR_PTR(error); } +static struct file *do_last(struct nameidata *nd, struct path *path, + int open_flag, int flag, int acc_mode, + int mode, const char *pathname, + struct dentry *dir, int *is_link) +{ + struct file *filp; + int error; + + *is_link = 0; + + error = PTR_ERR(path->dentry); + if (IS_ERR(path->dentry)) { + mutex_unlock(&dir->d_inode->i_mutex); + goto exit; + } + + if (IS_ERR(nd->intent.open.file)) { + error = PTR_ERR(nd->intent.open.file); + goto exit_mutex_unlock; + } + + /* Negative dentry, just create the file */ + if (!path->dentry->d_inode) { + /* + * This write is needed to ensure that a + * ro->rw transition does not occur between + * the time when the file is created and when + * a permanent write count is taken through + * the 'struct file' in nameidata_to_filp(). + */ + error = mnt_want_write(nd->path.mnt); + if (error) + goto exit_mutex_unlock; + error = __open_namei_create(nd, path, open_flag, mode); + if (error) { + mnt_drop_write(nd->path.mnt); + goto exit; + } + filp = nameidata_to_filp(nd); + mnt_drop_write(nd->path.mnt); + if (nd->root.mnt) + path_put(&nd->root); + if (!IS_ERR(filp)) { + error = ima_file_check(filp, acc_mode); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + return filp; + } + + /* + * It already exists. + */ + mutex_unlock(&dir->d_inode->i_mutex); + audit_inode(pathname, path->dentry); + + error = -EEXIST; + if (flag & O_EXCL) + goto exit_dput; + + if (__follow_mount(path)) { + error = -ELOOP; + if (flag & O_NOFOLLOW) + goto exit_dput; + } + + error = -ENOENT; + if (!path->dentry->d_inode) + goto exit_dput; + if (path->dentry->d_inode->i_op->follow_link) { + *is_link = 1; + return NULL; + } + + path_to_nameidata(path, nd); + error = -EISDIR; + if (S_ISDIR(path->dentry->d_inode->i_mode)) + goto exit; + filp = finish_open(nd, open_flag, flag, acc_mode); + if (nd->root.mnt) + path_put(&nd->root); + return filp; + +exit_mutex_unlock: + mutex_unlock(&dir->d_inode->i_mutex); +exit_dput: + path_put_conditional(path, nd); +exit: + if (!IS_ERR(nd->intent.open.file)) + release_open_intent(nd); + if (nd->root.mnt) + path_put(&nd->root); + path_put(&nd->path); + return ERR_PTR(error); +} + /* * Note that the low bits of the passed in "open_flag" * are not the same as in the local variable "flag". See @@ -1661,6 +1759,7 @@ struct file *do_filp_open(int dfd, const char *pathname, int count = 0; int flag = open_to_namei_flags(open_flag); int force_reval = 0; + int is_link; /* * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only @@ -1754,82 +1853,18 @@ reval: path.mnt = nd.path.mnt; do_last: - error = PTR_ERR(path.dentry); - if (IS_ERR(path.dentry)) { - mutex_unlock(&dir->d_inode->i_mutex); - goto exit; - } - - if (IS_ERR(nd.intent.open.file)) { - error = PTR_ERR(nd.intent.open.file); - goto exit_mutex_unlock; - } - - /* Negative dentry, just create the file */ - if (!path.dentry->d_inode) { - /* - * This write is needed to ensure that a - * ro->rw transition does not occur between - * the time when the file is created and when - * a permanent write count is taken through - * the 'struct file' in nameidata_to_filp(). - */ - error = mnt_want_write(nd.path.mnt); - if (error) - goto exit_mutex_unlock; - error = __open_namei_create(&nd, &path, open_flag, mode); - if (error) { - mnt_drop_write(nd.path.mnt); - goto exit; - } - filp = nameidata_to_filp(&nd); - mnt_drop_write(nd.path.mnt); - if (nd.root.mnt) - path_put(&nd.root); - if (!IS_ERR(filp)) { - error = ima_file_check(filp, acc_mode); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - return filp; - } - - /* - * It already exists. - */ - mutex_unlock(&dir->d_inode->i_mutex); - audit_inode(pathname, path.dentry); - - error = -EEXIST; - if (flag & O_EXCL) - goto exit_dput; - - if (__follow_mount(&path)) { - error = -ELOOP; - if (flag & O_NOFOLLOW) - goto exit_dput; - } - - error = -ENOENT; - if (!path.dentry->d_inode) - goto exit_dput; - if (path.dentry->d_inode->i_op->follow_link) + filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode, + pathname, dir, &is_link); + if (is_link) goto do_link; + return filp; - path_to_nameidata(&path, &nd); - error = -EISDIR; - if (S_ISDIR(path.dentry->d_inode->i_mode)) - goto exit; ok: filp = finish_open(&nd, open_flag, flag, acc_mode); if (nd.root.mnt) path_put(&nd.root); return filp; -exit_mutex_unlock: - mutex_unlock(&dir->d_inode->i_mutex); exit_dput: path_put_conditional(&path, &nd); exit: -- cgit v1.2.3 From 3343eb8209cc69f0d2059f8c484ad7a3e1834c0b Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Thu, 24 Dec 2009 02:02:38 -0500 Subject: Shift releasing nd->root from do_last() to its caller Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 3c39fa1608c5..bff27c08134c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1685,8 +1685,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } filp = nameidata_to_filp(nd); mnt_drop_write(nd->path.mnt); - if (nd->root.mnt) - path_put(&nd->root); if (!IS_ERR(filp)) { error = ima_file_check(filp, acc_mode); if (error) { @@ -1726,8 +1724,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (S_ISDIR(path->dentry->d_inode->i_mode)) goto exit; filp = finish_open(nd, open_flag, flag, acc_mode); - if (nd->root.mnt) - path_put(&nd->root); return filp; exit_mutex_unlock: @@ -1737,8 +1733,6 @@ exit_dput: exit: if (!IS_ERR(nd->intent.open.file)) release_open_intent(nd); - if (nd->root.mnt) - path_put(&nd->root); path_put(&nd->path); return ERR_PTR(error); } @@ -1857,6 +1851,8 @@ do_last: pathname, dir, &is_link); if (is_link) goto do_link; + if (nd.root.mnt) + path_put(&nd.root); return filp; ok: -- cgit v1.2.3 From 27bff34300482632caf52ff589a4e7d755b32539 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Thu, 24 Dec 2009 02:05:43 -0500 Subject: unroll do_last: loop in do_filp_open() Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index bff27c08134c..fc6bed7215c9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1845,8 +1845,6 @@ reval: mutex_lock(&dir->d_inode->i_mutex); path.dentry = lookup_hash(&nd); path.mnt = nd.path.mnt; - -do_last: filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode, pathname, dir, &is_link); if (is_link) @@ -1926,7 +1924,13 @@ do_link: path.dentry = lookup_hash(&nd); path.mnt = nd.path.mnt; __putname(nd.last.name); - goto do_last; + filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode, + pathname, dir, &is_link); + if (is_link) + goto do_link; + if (nd.root.mnt) + path_put(&nd.root); + return filp; } /** -- cgit v1.2.3 From c41c14056210e4a328659c82b1edaccb0910d18c Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Thu, 24 Dec 2009 02:08:19 -0500 Subject: postpone __putname() until after do_last() Since do_last() doesn't mangle nd->last_name, we can safely postpone __putname() done in handling of trailing symlinks until after the call of do_last() Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index fc6bed7215c9..30ba3f3a25e2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1923,9 +1923,9 @@ do_link: mutex_lock(&dir->d_inode->i_mutex); path.dentry = lookup_hash(&nd); path.mnt = nd.path.mnt; - __putname(nd.last.name); filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode, pathname, dir, &is_link); + __putname(nd.last.name); if (is_link) goto do_link; if (nd.root.mnt) -- cgit v1.2.3 From a1e28038df98e186807ff55a49c1c26d33d530a5 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Thu, 24 Dec 2009 02:12:06 -0500 Subject: pull the common predecessors into do_last() Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 30ba3f3a25e2..976fc323272e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1648,13 +1648,19 @@ exit: static struct file *do_last(struct nameidata *nd, struct path *path, int open_flag, int flag, int acc_mode, int mode, const char *pathname, - struct dentry *dir, int *is_link) + int *is_link) { + struct dentry *dir = nd->path.dentry; struct file *filp; int error; *is_link = 0; + mutex_lock(&dir->d_inode->i_mutex); + + path->dentry = lookup_hash(nd); + path->mnt = nd->path.mnt; + error = PTR_ERR(path->dentry); if (IS_ERR(path->dentry)) { mutex_unlock(&dir->d_inode->i_mutex); @@ -1749,7 +1755,6 @@ struct file *do_filp_open(int dfd, const char *pathname, struct nameidata nd; int error; struct path path; - struct dentry *dir; int count = 0; int flag = open_to_namei_flags(open_flag); int force_reval = 0; @@ -1837,16 +1842,12 @@ reval: filp->f_flags = open_flag; nd.intent.open.flags = flag; nd.intent.open.create_mode = mode; - dir = nd.path.dentry; nd.flags &= ~LOOKUP_PARENT; nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN; if (flag & O_EXCL) nd.flags |= LOOKUP_EXCL; - mutex_lock(&dir->d_inode->i_mutex); - path.dentry = lookup_hash(&nd); - path.mnt = nd.path.mnt; filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode, - pathname, dir, &is_link); + pathname, &is_link); if (is_link) goto do_link; if (nd.root.mnt) @@ -1919,12 +1920,8 @@ do_link: __putname(nd.last.name); goto exit; } - dir = nd.path.dentry; - mutex_lock(&dir->d_inode->i_mutex); - path.dentry = lookup_hash(&nd); - path.mnt = nd.path.mnt; filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode, - pathname, dir, &is_link); + pathname, &is_link); __putname(nd.last.name); if (is_link) goto do_link; -- cgit v1.2.3 From c99658fe970f442199733bcace1a00b087336a0d Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Thu, 24 Dec 2009 02:27:30 -0500 Subject: bail out with ELOOP earlier in do_link loop If we'd passed through 32 trailing symlinks already, there's no sense following the 33rd - we'll bail out anyway. Better bugger off earlier. It *does* change behaviour, after a fashion - if the 33rd happens to be a procfs-style symlink, original code *would* allow it. This one will not. Cry me a river if that hurts you. Please, do. And post a video of that, while you are at it. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 976fc323272e..84f1ec3b4a5d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1873,7 +1873,7 @@ exit_parent: do_link: error = -ELOOP; - if (flag & O_NOFOLLOW) + if ((flag & O_NOFOLLOW) || count++ == 32) goto exit_dput; /* * This is subtle. Instead of calling do_follow_link() we do the @@ -1915,11 +1915,6 @@ do_link: __putname(nd.last.name); goto exit; } - error = -ELOOP; - if (count++==32) { - __putname(nd.last.name); - goto exit; - } filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode, pathname, &is_link); __putname(nd.last.name); -- cgit v1.2.3 From a2c36b450ee68470836cb858c58a6ba3a52c5ec5 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Thu, 24 Dec 2009 03:39:50 -0500 Subject: pull more into do_last() Handling of LAST_DOT/LAST_ROOT/LAST_DOTDOT/terminating slash can be pulled in as well Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 84f1ec3b4a5d..52517e0bbdde 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1656,6 +1656,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, *is_link = 0; + error = -EISDIR; + if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len]) + goto exit; + mutex_lock(&dir->d_inode->i_mutex); path->dentry = lookup_hash(nd); @@ -1826,13 +1830,8 @@ reval: audit_inode(pathname, nd.path.dentry); /* - * We have the parent and last component. First of all, check - * that we are not asked to creat(2) an obvious directory - that - * will not do. + * We have the parent and last component. */ - error = -EISDIR; - if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len]) - goto exit_parent; error = -ENFILE; filp = get_empty_filp(); @@ -1908,16 +1907,10 @@ do_link: nd.flags &= ~LOOKUP_PARENT; if (nd.last_type == LAST_BIND) goto ok; - error = -EISDIR; - if (nd.last_type != LAST_NORM) - goto exit; - if (nd.last.name[nd.last.len]) { - __putname(nd.last.name); - goto exit; - } filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode, pathname, &is_link); - __putname(nd.last.name); + if (nd.last_type == LAST_NORM) + __putname(nd.last.name); if (is_link) goto do_link; if (nd.root.mnt) -- cgit v1.2.3 From 9a66179e13504c676f891908a1e94912ec5cdefb Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Thu, 24 Dec 2009 06:49:47 -0500 Subject: Don't pass mangled open_flag to finish_open() Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 52517e0bbdde..5b9016006913 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1591,13 +1591,13 @@ static int open_will_truncate(int flag, struct inode *inode) } static struct file *finish_open(struct nameidata *nd, - int open_flag, int flag, int acc_mode) + int open_flag, int acc_mode) { struct file *filp; int will_truncate; int error; - will_truncate = open_will_truncate(flag, nd->path.dentry->d_inode); + will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode); if (will_truncate) { error = mnt_want_write(nd->path.mnt); if (error) @@ -1733,7 +1733,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, error = -EISDIR; if (S_ISDIR(path->dentry->d_inode->i_mode)) goto exit; - filp = finish_open(nd, open_flag, flag, acc_mode); + filp = finish_open(nd, open_flag, acc_mode); return filp; exit_mutex_unlock: @@ -1854,7 +1854,7 @@ reval: return filp; ok: - filp = finish_open(&nd, open_flag, flag, acc_mode); + filp = finish_open(&nd, open_flag, acc_mode); if (nd.root.mnt) path_put(&nd.root); return filp; -- cgit v1.2.3 From 5b369df8263fe7ab4dac2bb08b8f423dc5e33752 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Thu, 24 Dec 2009 06:51:13 -0500 Subject: Get rid of passing mangled flag to do_last() Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 5b9016006913..5ea7330c184b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1646,7 +1646,7 @@ exit: } static struct file *do_last(struct nameidata *nd, struct path *path, - int open_flag, int flag, int acc_mode, + int open_flag, int acc_mode, int mode, const char *pathname, int *is_link) { @@ -1712,12 +1712,12 @@ static struct file *do_last(struct nameidata *nd, struct path *path, audit_inode(pathname, path->dentry); error = -EEXIST; - if (flag & O_EXCL) + if (open_flag & O_EXCL) goto exit_dput; if (__follow_mount(path)) { error = -ELOOP; - if (flag & O_NOFOLLOW) + if (open_flag & O_NOFOLLOW) goto exit_dput; } @@ -1845,7 +1845,7 @@ reval: nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN; if (flag & O_EXCL) nd.flags |= LOOKUP_EXCL; - filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode, + filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &is_link); if (is_link) goto do_link; @@ -1907,7 +1907,7 @@ do_link: nd.flags &= ~LOOKUP_PARENT; if (nd.last_type == LAST_BIND) goto ok; - filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode, + filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &is_link); if (nd.last_type == LAST_NORM) __putname(nd.last.name); -- cgit v1.2.3 From 4296e2cbf2138b5831b83f03e81de916ce1a967d Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Thu, 24 Dec 2009 07:15:41 -0500 Subject: Leave mangled flag only for setting nd.intent.open.flag Nothing else uses it anymore Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 5ea7330c184b..f5e4397dcd7e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1777,18 +1777,18 @@ struct file *do_filp_open(int dfd, const char *pathname, acc_mode = MAY_OPEN | ACC_MODE(open_flag); /* O_TRUNC implies we need access checks for write permissions */ - if (flag & O_TRUNC) + if (open_flag & O_TRUNC) acc_mode |= MAY_WRITE; /* Allow the LSM permission hook to distinguish append access from general write access. */ - if (flag & O_APPEND) + if (open_flag & O_APPEND) acc_mode |= MAY_APPEND; /* * The simplest case - just a plain lookup. */ - if (!(flag & O_CREAT)) { + if (!(open_flag & O_CREAT)) { filp = get_empty_filp(); if (filp == NULL) @@ -1798,7 +1798,7 @@ struct file *do_filp_open(int dfd, const char *pathname, nd.intent.open.flags = flag; nd.intent.open.create_mode = 0; error = do_path_lookup(dfd, pathname, - lookup_flags(flag)|LOOKUP_OPEN, &nd); + lookup_flags(open_flag)|LOOKUP_OPEN, &nd); if (IS_ERR(nd.intent.open.file)) { if (error == 0) { error = PTR_ERR(nd.intent.open.file); @@ -1843,7 +1843,7 @@ reval: nd.intent.open.create_mode = mode; nd.flags &= ~LOOKUP_PARENT; nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN; - if (flag & O_EXCL) + if (open_flag & O_EXCL) nd.flags |= LOOKUP_EXCL; filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &is_link); @@ -1872,7 +1872,7 @@ exit_parent: do_link: error = -ELOOP; - if ((flag & O_NOFOLLOW) || count++ == 32) + if ((open_flag & O_NOFOLLOW) || count++ == 32) goto exit_dput; /* * This is subtle. Instead of calling do_follow_link() we do the -- cgit v1.2.3 From 67ee3ad21d0d0b2cc0b70708de8aed860fadda44 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sat, 26 Dec 2009 07:01:01 -0500 Subject: Pull handling of LAST_BIND into do_last(), clean up ok: part in do_filp_open() Note that in case of !O_CREAT we know that nd.root has already been given up Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index f5e4397dcd7e..0b4d19d47e67 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1656,6 +1656,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, *is_link = 0; + if (nd->last_type == LAST_BIND) + goto ok; + error = -EISDIR; if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len]) goto exit; @@ -1733,6 +1736,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, error = -EISDIR; if (S_ISDIR(path->dentry->d_inode->i_mode)) goto exit; +ok: filp = finish_open(nd, open_flag, acc_mode); return filp; @@ -1808,7 +1812,7 @@ struct file *do_filp_open(int dfd, const char *pathname, release_open_intent(&nd); if (error) return ERR_PTR(error); - goto ok; + return finish_open(&nd, open_flag, acc_mode); } /* @@ -1853,21 +1857,14 @@ reval: path_put(&nd.root); return filp; -ok: - filp = finish_open(&nd, open_flag, acc_mode); - if (nd.root.mnt) - path_put(&nd.root); - return filp; - exit_dput: path_put_conditional(&path, &nd); -exit: if (!IS_ERR(nd.intent.open.file)) release_open_intent(&nd); exit_parent: + path_put(&nd.path); if (nd.root.mnt) path_put(&nd.root); - path_put(&nd.path); return ERR_PTR(error); do_link: @@ -1905,8 +1902,6 @@ do_link: return ERR_PTR(error); } nd.flags &= ~LOOKUP_PARENT; - if (nd.last_type == LAST_BIND) - goto ok; filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &is_link); if (nd.last_type == LAST_NORM) -- cgit v1.2.3 From 9e67f36169117e07daf16dc7ca314f1db9e2050a Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sat, 26 Dec 2009 07:04:50 -0500 Subject: Kill is_link argument of do_last() We set it to 1 iff we return NULL Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 0b4d19d47e67..b0c74fe91fb0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1647,15 +1647,12 @@ exit: static struct file *do_last(struct nameidata *nd, struct path *path, int open_flag, int acc_mode, - int mode, const char *pathname, - int *is_link) + int mode, const char *pathname) { struct dentry *dir = nd->path.dentry; struct file *filp; int error; - *is_link = 0; - if (nd->last_type == LAST_BIND) goto ok; @@ -1727,10 +1724,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, error = -ENOENT; if (!path->dentry->d_inode) goto exit_dput; - if (path->dentry->d_inode->i_op->follow_link) { - *is_link = 1; + + if (path->dentry->d_inode->i_op->follow_link) return NULL; - } path_to_nameidata(path, nd); error = -EISDIR; @@ -1766,7 +1762,6 @@ struct file *do_filp_open(int dfd, const char *pathname, int count = 0; int flag = open_to_namei_flags(open_flag); int force_reval = 0; - int is_link; /* * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only @@ -1849,9 +1844,8 @@ reval: nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN; if (open_flag & O_EXCL) nd.flags |= LOOKUP_EXCL; - filp = do_last(&nd, &path, open_flag, acc_mode, mode, - pathname, &is_link); - if (is_link) + filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); + if (!filp) goto do_link; if (nd.root.mnt) path_put(&nd.root); @@ -1902,11 +1896,10 @@ do_link: return ERR_PTR(error); } nd.flags &= ~LOOKUP_PARENT; - filp = do_last(&nd, &path, open_flag, acc_mode, mode, - pathname, &is_link); + filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); if (nd.last_type == LAST_NORM) __putname(nd.last.name); - if (is_link) + if (!filp) goto do_link; if (nd.root.mnt) path_put(&nd.root); -- cgit v1.2.3 From 10fa8e62f2bc33c452516585911f151d88389e4c Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sat, 26 Dec 2009 07:09:49 -0500 Subject: Unify exits in O_CREAT handling Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index b0c74fe91fb0..675a712137f1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1821,9 +1821,8 @@ reval: nd.flags |= LOOKUP_REVAL; error = path_walk(pathname, &nd); if (error) { - if (nd.root.mnt) - path_put(&nd.root); - return ERR_PTR(error); + filp = ERR_PTR(error); + goto out; } if (unlikely(!audit_dummy_context())) audit_inode(pathname, nd.path.dentry); @@ -1847,9 +1846,7 @@ reval: filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); if (!filp) goto do_link; - if (nd.root.mnt) - path_put(&nd.root); - return filp; + goto out; exit_dput: path_put_conditional(&path, &nd); @@ -1857,9 +1854,15 @@ exit_dput: release_open_intent(&nd); exit_parent: path_put(&nd.path); + filp = ERR_PTR(error); +out: if (nd.root.mnt) path_put(&nd.root); - return ERR_PTR(error); + if (filp == ERR_PTR(-ESTALE) && !force_reval) { + force_reval = 1; + goto reval; + } + return filp; do_link: error = -ELOOP; @@ -1887,13 +1890,8 @@ do_link: * with "intent.open". */ release_open_intent(&nd); - if (nd.root.mnt) - path_put(&nd.root); - if (error == -ESTALE && !force_reval) { - force_reval = 1; - goto reval; - } - return ERR_PTR(error); + filp = ERR_PTR(error); + goto out; } nd.flags &= ~LOOKUP_PARENT; filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); @@ -1901,9 +1899,7 @@ do_link: __putname(nd.last.name); if (!filp) goto do_link; - if (nd.root.mnt) - path_put(&nd.root); - return filp; + goto out; } /** -- cgit v1.2.3 From 806b681cbe588bebe8fe47dd24da62f2d1c55851 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sat, 26 Dec 2009 07:16:40 -0500 Subject: Turn do_link spaghetty into a normal loop Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 83 ++++++++++++++++++++++++++++---------------------------------- 1 file changed, 38 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 675a712137f1..08da937b1ee2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1844,17 +1844,38 @@ reval: if (open_flag & O_EXCL) nd.flags |= LOOKUP_EXCL; filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); - if (!filp) - goto do_link; - goto out; - -exit_dput: - path_put_conditional(&path, &nd); - if (!IS_ERR(nd.intent.open.file)) - release_open_intent(&nd); -exit_parent: - path_put(&nd.path); - filp = ERR_PTR(error); + while (unlikely(!filp)) { /* trailing symlink */ + error = -ELOOP; + if ((open_flag & O_NOFOLLOW) || count++ == 32) + goto exit_dput; + /* + * This is subtle. Instead of calling do_follow_link() we do + * the thing by hands. The reason is that this way we have zero + * link_count and path_walk() (called from ->follow_link) + * honoring LOOKUP_PARENT. After that we have the parent and + * last component, i.e. we are in the same situation as after + * the first path_walk(). Well, almost - if the last component + * is normal we get its copy stored in nd->last.name and we will + * have to putname() it when we are done. Procfs-like symlinks + * just set LAST_BIND. + */ + nd.flags |= LOOKUP_PARENT; + error = security_inode_follow_link(path.dentry, &nd); + if (error) + goto exit_dput; + error = __do_follow_link(&path, &nd); + path_put(&path); + if (error) { + /* nd.path had been dropped */ + release_open_intent(&nd); + filp = ERR_PTR(error); + goto out; + } + nd.flags &= ~LOOKUP_PARENT; + filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); + if (nd.last_type == LAST_NORM) + __putname(nd.last.name); + } out: if (nd.root.mnt) path_put(&nd.root); @@ -1864,41 +1885,13 @@ out: } return filp; -do_link: - error = -ELOOP; - if ((open_flag & O_NOFOLLOW) || count++ == 32) - goto exit_dput; - /* - * This is subtle. Instead of calling do_follow_link() we do the - * thing by hands. The reason is that this way we have zero link_count - * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT. - * After that we have the parent and last component, i.e. - * we are in the same situation as after the first path_walk(). - * Well, almost - if the last component is normal we get its copy - * stored in nd->last.name and we will have to putname() it when we - * are done. Procfs-like symlinks just set LAST_BIND. - */ - nd.flags |= LOOKUP_PARENT; - error = security_inode_follow_link(path.dentry, &nd); - if (error) - goto exit_dput; - error = __do_follow_link(&path, &nd); - path_put(&path); - if (error) { - /* Does someone understand code flow here? Or it is only - * me so stupid? Anathema to whoever designed this non-sense - * with "intent.open". - */ +exit_dput: + path_put_conditional(&path, &nd); + if (!IS_ERR(nd.intent.open.file)) release_open_intent(&nd); - filp = ERR_PTR(error); - goto out; - } - nd.flags &= ~LOOKUP_PARENT; - filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); - if (nd.last_type == LAST_NORM) - __putname(nd.last.name); - if (!filp) - goto do_link; +exit_parent: + path_put(&nd.path); + filp = ERR_PTR(error); goto out; } -- cgit v1.2.3 From 3866248e5f86d74960a3d1592882490ec3021675 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sat, 26 Dec 2009 07:21:48 -0500 Subject: Finish pulling of -ESTALE handling to upper level in do_filp_open() Don't bother with path_walk() (and its retry loop); link_path_walk() will do it. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 08da937b1ee2..adfbaf5c04a7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1819,7 +1819,9 @@ reval: return ERR_PTR(error); if (force_reval) nd.flags |= LOOKUP_REVAL; - error = path_walk(pathname, &nd); + + current->total_link_count = 0; + error = link_path_walk(pathname, &nd); if (error) { filp = ERR_PTR(error); goto out; -- cgit v1.2.3 From def4af30cf945a3735ffca865788ea84b30b25d9 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sat, 26 Dec 2009 08:37:05 -0500 Subject: Get rid of symlink body copying Now that nd->last stays around until ->put_link() is called, we can just postpone that ->put_link() in do_filp_open() a bit and don't bother with copying. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 55 ++++++++++++++++++++++++------------------------------- 1 file changed, 24 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index adfbaf5c04a7..1f5d86d1fbf5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -498,8 +498,6 @@ static int link_path_walk(const char *, struct nameidata *); static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) { - int res = 0; - char *name; if (IS_ERR(link)) goto fail; @@ -510,22 +508,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l path_get(&nd->root); } - res = link_path_walk(link, nd); - if (nd->depth || res || nd->last_type!=LAST_NORM) - return res; - /* - * If it is an iterative symlinks resolution in open_namei() we - * have to copy the last component. And all that crap because of - * bloody create() on broken symlinks. Furrfu... - */ - name = __getname(); - if (unlikely(!name)) { - path_put(&nd->path); - return -ENOMEM; - } - strcpy(name, nd->last.name); - nd->last.name = name; - return 0; + return link_path_walk(link, nd); fail: path_put(&nd->path); return PTR_ERR(link); @@ -547,10 +530,10 @@ static inline void path_to_nameidata(struct path *path, struct nameidata *nd) nd->path.dentry = path->dentry; } -static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd) +static __always_inline int +__do_follow_link(struct path *path, struct nameidata *nd, void **p) { int error; - void *cookie; struct dentry *dentry = path->dentry; touch_atime(path->mnt, dentry); @@ -562,9 +545,9 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata } mntget(path->mnt); nd->last_type = LAST_BIND; - cookie = dentry->d_inode->i_op->follow_link(dentry, nd); - error = PTR_ERR(cookie); - if (!IS_ERR(cookie)) { + *p = dentry->d_inode->i_op->follow_link(dentry, nd); + error = PTR_ERR(*p); + if (!IS_ERR(*p)) { char *s = nd_get_link(nd); error = 0; if (s) @@ -574,8 +557,6 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata if (error) path_put(&nd->path); } - if (dentry->d_inode->i_op->put_link) - dentry->d_inode->i_op->put_link(dentry, nd, cookie); } return error; } @@ -589,6 +570,7 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata */ static inline int do_follow_link(struct path *path, struct nameidata *nd) { + void *cookie; int err = -ELOOP; if (current->link_count >= MAX_NESTED_LINKS) goto loop; @@ -602,7 +584,9 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd) current->link_count++; current->total_link_count++; nd->depth++; - err = __do_follow_link(path, nd); + err = __do_follow_link(path, nd, &cookie); + if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link) + path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie); path_put(path); current->link_count--; nd->depth--; @@ -1847,6 +1831,9 @@ reval: nd.flags |= LOOKUP_EXCL; filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); while (unlikely(!filp)) { /* trailing symlink */ + struct path holder; + struct inode *inode; + void *cookie; error = -ELOOP; if ((open_flag & O_NOFOLLOW) || count++ == 32) goto exit_dput; @@ -1865,18 +1852,24 @@ reval: error = security_inode_follow_link(path.dentry, &nd); if (error) goto exit_dput; - error = __do_follow_link(&path, &nd); - path_put(&path); - if (error) { + error = __do_follow_link(&path, &nd, &cookie); + if (unlikely(error)) { /* nd.path had been dropped */ + inode = path.dentry->d_inode; + if (!IS_ERR(cookie) && inode->i_op->put_link) + inode->i_op->put_link(path.dentry, &nd, cookie); + path_put(&path); release_open_intent(&nd); filp = ERR_PTR(error); goto out; } + holder = path; nd.flags &= ~LOOKUP_PARENT; filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); - if (nd.last_type == LAST_NORM) - __putname(nd.last.name); + inode = holder.dentry->d_inode; + if (inode->i_op->put_link) + inode->i_op->put_link(holder.dentry, &nd, cookie); + path_put(&holder); } out: if (nd.root.mnt) -- cgit v1.2.3 From 1f36f774b22a0ceb7dd33eca626746c81a97b6a5 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@zeniv.linux.org.uk> Date: Sat, 26 Dec 2009 10:56:19 -0500 Subject: Switch !O_CREAT case to use of do_last() ... and now we have all intents crap well localized Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/namei.c | 127 ++++++++++++++++++++++++++++++++----------------------------- 1 file changed, 66 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 1f5d86d1fbf5..9a6456099f1e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1359,22 +1359,6 @@ static inline int may_create(struct inode *dir, struct dentry *child) return inode_permission(dir, MAY_WRITE | MAY_EXEC); } -/* - * O_DIRECTORY translates into forcing a directory lookup. - */ -static inline int lookup_flags(unsigned int f) -{ - unsigned long retval = LOOKUP_FOLLOW; - - if (f & O_NOFOLLOW) - retval &= ~LOOKUP_FOLLOW; - - if (f & O_DIRECTORY) - retval |= LOOKUP_DIRECTORY; - - return retval; -} - /* * p1 and p2 should be directories on the same fs. */ @@ -1631,19 +1615,60 @@ exit: static struct file *do_last(struct nameidata *nd, struct path *path, int open_flag, int acc_mode, - int mode, const char *pathname) + int mode, const char *pathname, + int *want_dir) { struct dentry *dir = nd->path.dentry; struct file *filp; - int error; + int error = -EISDIR; - if (nd->last_type == LAST_BIND) + switch (nd->last_type) { + case LAST_DOTDOT: + follow_dotdot(nd); + dir = nd->path.dentry; + if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { + if (!dir->d_op->d_revalidate(dir, nd)) { + error = -ESTALE; + goto exit; + } + } + /* fallthrough */ + case LAST_DOT: + case LAST_ROOT: + if (open_flag & O_CREAT) + goto exit; + /* fallthrough */ + case LAST_BIND: + audit_inode(pathname, dir); goto ok; + } - error = -EISDIR; - if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len]) - goto exit; + /* trailing slashes? */ + if (nd->last.name[nd->last.len]) { + if (open_flag & O_CREAT) + goto exit; + *want_dir = 1; + } + /* just plain open? */ + if (!(open_flag & O_CREAT)) { + error = do_lookup(nd, &nd->last, path); + if (error) + goto exit; + error = -ENOENT; + if (!path->dentry->d_inode) + goto exit_dput; + if (path->dentry->d_inode->i_op->follow_link) + return NULL; + error = -ENOTDIR; + if (*want_dir & !path->dentry->d_inode->i_op->lookup) + goto exit_dput; + path_to_nameidata(path, nd); + audit_inode(pathname, nd->path.dentry); + goto ok; + } + + /* OK, it's O_CREAT */ mutex_lock(&dir->d_inode->i_mutex); path->dentry = lookup_hash(nd); @@ -1746,6 +1771,10 @@ struct file *do_filp_open(int dfd, const char *pathname, int count = 0; int flag = open_to_namei_flags(open_flag); int force_reval = 0; + int want_dir = open_flag & O_DIRECTORY; + + if (!(open_flag & O_CREAT)) + mode = 0; /* * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only @@ -1768,35 +1797,7 @@ struct file *do_filp_open(int dfd, const char *pathname, if (open_flag & O_APPEND) acc_mode |= MAY_APPEND; - /* - * The simplest case - just a plain lookup. - */ - if (!(open_flag & O_CREAT)) { - filp = get_empty_filp(); - - if (filp == NULL) - return ERR_PTR(-ENFILE); - nd.intent.open.file = filp; - filp->f_flags = open_flag; - nd.intent.open.flags = flag; - nd.intent.open.create_mode = 0; - error = do_path_lookup(dfd, pathname, - lookup_flags(open_flag)|LOOKUP_OPEN, &nd); - if (IS_ERR(nd.intent.open.file)) { - if (error == 0) { - error = PTR_ERR(nd.intent.open.file); - path_put(&nd.path); - } - } else if (error) - release_open_intent(&nd); - if (error) - return ERR_PTR(error); - return finish_open(&nd, open_flag, acc_mode); - } - - /* - * Create - we need to know the parent. - */ + /* find the parent */ reval: error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); if (error) @@ -1810,7 +1811,7 @@ reval: filp = ERR_PTR(error); goto out; } - if (unlikely(!audit_dummy_context())) + if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT)) audit_inode(pathname, nd.path.dentry); /* @@ -1826,16 +1827,22 @@ reval: nd.intent.open.flags = flag; nd.intent.open.create_mode = mode; nd.flags &= ~LOOKUP_PARENT; - nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN; - if (open_flag & O_EXCL) - nd.flags |= LOOKUP_EXCL; - filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); + nd.flags |= LOOKUP_OPEN; + if (open_flag & O_CREAT) { + nd.flags |= LOOKUP_CREATE; + if (open_flag & O_EXCL) + nd.flags |= LOOKUP_EXCL; + } + filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir); while (unlikely(!filp)) { /* trailing symlink */ struct path holder; - struct inode *inode; + struct inode *inode = path.dentry->d_inode; void *cookie; error = -ELOOP; - if ((open_flag & O_NOFOLLOW) || count++ == 32) + /* S_ISDIR part is a temporary automount kludge */ + if ((open_flag & O_NOFOLLOW) && !S_ISDIR(inode->i_mode)) + goto exit_dput; + if (count++ == 32) goto exit_dput; /* * This is subtle. Instead of calling do_follow_link() we do @@ -1855,7 +1862,6 @@ reval: error = __do_follow_link(&path, &nd, &cookie); if (unlikely(error)) { /* nd.path had been dropped */ - inode = path.dentry->d_inode; if (!IS_ERR(cookie) && inode->i_op->put_link) inode->i_op->put_link(path.dentry, &nd, cookie); path_put(&path); @@ -1865,8 +1871,7 @@ reval: } holder = path; nd.flags &= ~LOOKUP_PARENT; - filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); - inode = holder.dentry->d_inode; + filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir); if (inode->i_op->put_link) inode->i_op->put_link(holder.dentry, &nd, cookie); path_put(&holder); -- cgit v1.2.3 From 6a08ab846cefc82a328cbf9abd96c2e58a6c3664 Mon Sep 17 00:00:00 2001 From: Joern Engel <joern@logfs.org> Date: Fri, 5 Mar 2010 16:07:04 +0100 Subject: [LogFS] Check feature flags --- fs/logfs/logfs_abi.h | 4 ++++ fs/logfs/super.c | 10 ++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h index 5d3782ddecc8..8d4dd3de551e 100644 --- a/fs/logfs/logfs_abi.h +++ b/fs/logfs/logfs_abi.h @@ -193,6 +193,10 @@ struct logfs_segment_header { SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE); +#define LOGFS_FEATURES_INCOMPAT (0ull) +#define LOGFS_FEATURES_RO_COMPAT (0ull) +#define LOGFS_FEATURES_COMPAT (0ull) + /** * struct logfs_disk_super - on-medium superblock * diff --git a/fs/logfs/super.c b/fs/logfs/super.c index 1d081b7ede5a..c66beab78dee 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -440,7 +440,7 @@ static int __logfs_read_sb(struct super_block *sb) return 0; } -static int logfs_read_sb(struct super_block *sb) +static int logfs_read_sb(struct super_block *sb, int read_only) { struct logfs_super *super = logfs_super(sb); int ret; @@ -460,6 +460,12 @@ static int logfs_read_sb(struct super_block *sb) if (ret) return ret; + if (super->s_feature_incompat & ~LOGFS_FEATURES_INCOMPAT) + return -EIO; + if ((super->s_feature_ro_compat & ~LOGFS_FEATURES_RO_COMPAT) && + !read_only) + return -EIO; + mutex_init(&super->s_dirop_mutex); mutex_init(&super->s_object_alias_mutex); INIT_LIST_HEAD(&super->s_freeing_list); @@ -555,7 +561,7 @@ int logfs_get_sb_device(struct file_system_type *type, int flags, sb->s_op = &logfs_super_operations; sb->s_flags = flags | MS_NOATIME; - err = logfs_read_sb(sb); + err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY); if (err) goto err1; -- cgit v1.2.3 From ae4a3179b11a97a5b36a768ae6ac1662d0315ff0 Mon Sep 17 00:00:00 2001 From: Phillip Lougher <phillip@lougher.demon.co.uk> Date: Thu, 25 Feb 2010 00:54:48 +0000 Subject: Squashfs: get rid of obsolete variable in struct squashfs_sb_info Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk> --- fs/squashfs/squashfs_fs_sb.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 753335085e41..2e77dc547e25 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -61,7 +61,6 @@ struct squashfs_sb_info { int next_meta_index; __le64 *id_table; __le64 *fragment_index; - unsigned int *fragment_index_2; struct mutex read_data_mutex; struct mutex meta_index_mutex; struct meta_index *meta_index; -- cgit v1.2.3 From 06862f884d9c2453daaf0c1d070c69cf444f10b1 Mon Sep 17 00:00:00 2001 From: Phillip Lougher <phillip@lougher.demon.co.uk> Date: Thu, 25 Feb 2010 01:31:13 +0000 Subject: Squashfs: get rid of obsolete definition in header file Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk> --- fs/squashfs/squashfs_fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 36e1604ab1c1..79024245ea00 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -183,8 +183,6 @@ #define SQUASHFS_MAX_FILE_SIZE (1LL << \ (SQUASHFS_MAX_FILE_SIZE_LOG - 2)) -#define SQUASHFS_MARKER_BYTE 0xff - /* meta index cache */ #define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int)) #define SQUASHFS_META_ENTRIES 127 -- cgit v1.2.3 From 694189328a7e566cb84bd3205503a42b60e87882 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Thu, 4 Mar 2010 00:57:09 +0000 Subject: xfs: Fix a build warning in xfs_aops.c Fix a build warning that slipped through. Dave Chinner had posted an updated version of his patch but the previous version--without this fix--was what got committed. Reported-by: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_aops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index b493c63976cd..793908a8a09b 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -229,7 +229,7 @@ xfs_end_io( { xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); struct xfs_inode *ip = XFS_I(ioend->io_inode); - int error; + int error = 0; /* * For unwritten extents we need to issue transactions to convert a -- cgit v1.2.3 From 20f6b2c785cf187445f126321638ab8ba7aa7494 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Thu, 4 Mar 2010 01:46:23 +0000 Subject: xfs: check for more work before sleeping in xfssyncd xfssyncd processes a queue of work by detaching the queue and then iterating over all the work items. It then sleeps for a time period or until new work comes in. If new work is queued while xfssyncd is actively processing the detached work queue, it will not process that new work until after a sleep timeout or the next work event queued wakes it. Fix this by checking the work queue again before going to sleep. Signed-off-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_sync.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index f9fc154d9f72..05cd85317f6f 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -607,7 +607,8 @@ xfssyncd( set_freezable(); timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); for (;;) { - timeleft = schedule_timeout_interruptible(timeleft); + if (list_empty(&mp->m_sync_list)) + timeleft = schedule_timeout_interruptible(timeleft); /* swsusp */ try_to_freeze(); if (kthread_should_stop() && list_empty(&mp->m_sync_list)) @@ -627,8 +628,7 @@ xfssyncd( list_add_tail(&mp->m_sync_work.w_list, &mp->m_sync_list); } - list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list) - list_move(&work->w_list, &tmp); + list_splice_init(&mp->m_sync_list, &tmp); spin_unlock(&mp->m_sync_lock); list_for_each_entry_safe(work, n, &tmp, w_list) { -- cgit v1.2.3 From 3ed3a4343b79a79d10e31f85f2d1afabcead76c6 Mon Sep 17 00:00:00 2001 From: Dave Chinner <dchinner@redhat.com> Date: Fri, 5 Mar 2010 02:00:42 +0000 Subject: xfs: truncate delalloc extents when IO fails in writeback We currently use block_invalidatepage() to clean up pages where I/O fails in ->writepage(). Unfortunately, if the page has delalloc regions on it, we fail to remove the delalloc regions when we invalidate the page. This can result in tripping a BUG() in xfs_get_blocks() later on if a direct IO read is done on that same region - the delalloc extent is returned when none is supposed to be there. Fix this by truncating away the delalloc regions on the page before invalidating it. Because they are delalloc, we can do this without needing a transaction. Indeed - if we get ENOSPC errors, we have to be able to do this truncation without a transaction as there is no space left for block reservation (typically why we see a ENOSPC in writeback). Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/linux-2.6/xfs_aops.c | 124 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 114 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 793908a8a09b..9083357f9e44 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -39,6 +39,7 @@ #include "xfs_iomap.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_bmap.h" #include <linux/mpage.h> #include <linux/pagevec.h> #include <linux/writeback.h> @@ -893,6 +894,118 @@ xfs_cluster_write( } } +STATIC void +xfs_vm_invalidatepage( + struct page *page, + unsigned long offset) +{ + trace_xfs_invalidatepage(page->mapping->host, page, offset); + block_invalidatepage(page, offset); +} + +/* + * If the page has delalloc buffers on it, we need to punch them out before we + * invalidate the page. If we don't, we leave a stale delalloc mapping on the + * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read + * is done on that same region - the delalloc extent is returned when none is + * supposed to be there. + * + * We prevent this by truncating away the delalloc regions on the page before + * invalidating it. Because they are delalloc, we can do this without needing a + * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this + * truncation without a transaction as there is no space left for block + * reservation (typically why we see a ENOSPC in writeback). + * + * This is not a performance critical path, so for now just do the punching a + * buffer head at a time. + */ +STATIC void +xfs_aops_discard_page( + struct page *page) +{ + struct inode *inode = page->mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct buffer_head *bh, *head; + loff_t offset = page_offset(page); + ssize_t len = 1 << inode->i_blkbits; + + if (!xfs_is_delayed_page(page, IOMAP_DELAY)) + goto out_invalidate; + + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "page discard on page %p, inode 0x%llx, offset %llu.", + page, ip->i_ino, offset); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + bh = head = page_buffers(page); + do { + int done; + xfs_fileoff_t offset_fsb; + xfs_bmbt_irec_t imap; + int nimaps = 1; + int error; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + + if (!buffer_delay(bh)) + goto next_buffer; + + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + + /* + * Map the range first and check that it is a delalloc extent + * before trying to unmap the range. Otherwise we will be + * trying to remove a real extent (which requires a + * transaction) or a hole, which is probably a bad idea... + */ + error = xfs_bmapi(NULL, ip, offset_fsb, 1, + XFS_BMAPI_ENTIRE, NULL, 0, &imap, + &nimaps, NULL, NULL); + + if (error) { + /* something screwed, just bail */ + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "page discard failed delalloc mapping lookup."); + break; + } + if (!nimaps) { + /* nothing there */ + goto next_buffer; + } + if (imap.br_startblock != DELAYSTARTBLOCK) { + /* been converted, ignore */ + goto next_buffer; + } + WARN_ON(imap.br_blockcount == 0); + + /* + * Note: while we initialise the firstblock/flist pair, they + * should never be used because blocks should never be + * allocated or freed for a delalloc extent and hence we need + * don't cancel or finish them after the xfs_bunmapi() call. + */ + xfs_bmap_init(&flist, &firstblock); + error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock, + &flist, NULL, &done); + + ASSERT(!flist.xbf_count && !flist.xbf_first); + if (error) { + /* something screwed, just bail */ + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "page discard unable to remove delalloc mapping."); + break; + } +next_buffer: + offset += len; + + } while ((bh = bh->b_this_page) != head); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_invalidate: + xfs_vm_invalidatepage(page, 0); + return; +} + /* * Calling this without startio set means we are being asked to make a dirty * page ready for freeing it's buffers. When called with startio set then @@ -1144,7 +1257,7 @@ error: */ if (err != -EAGAIN) { if (!unmapped) - block_invalidatepage(page, 0); + xfs_aops_discard_page(page); ClearPageUptodate(page); } return err; @@ -1554,15 +1667,6 @@ xfs_vm_readpages( return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); } -STATIC void -xfs_vm_invalidatepage( - struct page *page, - unsigned long offset) -{ - trace_xfs_invalidatepage(page->mapping->host, page, offset); - block_invalidatepage(page, offset); -} - const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, .readpages = xfs_vm_readpages, -- cgit v1.2.3 From 8babd8a2e75cccff3167a61176c2a3e977e13799 Mon Sep 17 00:00:00 2001 From: Dave Chinner <david@fromorbit.com> Date: Thu, 4 Mar 2010 01:46:25 +0000 Subject: xfs: Increase the default size of the reserved blocks pool The current default size of the reserved blocks pool is easy to deplete with certain workloads, in particular workloads that do lots of concurrent delayed allocation extent conversions. If enough transactions are running in parallel and the entire pool is consumed then subsequent calls to xfs_trans_reserve() will fail with ENOSPC. Also add a rate limited warning so we know if this starts happening again. This is an updated version of an old patch from Lachlan McIlroy. Signed-off-by: Dave Chinner <david@fromorbit.com> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/xfs_mount.c | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c207fef6770b..e79b56b4bca6 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1097,13 +1097,15 @@ xfs_default_resblks(xfs_mount_t *mp) __uint64_t resblks; /* - * We default to 5% or 1024 fsbs of space reserved, whichever is smaller. - * This may drive us straight to ENOSPC on mount, but that implies - * we were already there on the last unmount. Warn if this occurs. + * We default to 5% or 8192 fsbs of space reserved, whichever is + * smaller. This is intended to cover concurrent allocation + * transactions when we initially hit enospc. These each require a 4 + * block reservation. Hence by default we cover roughly 2000 concurrent + * allocation reservations. */ resblks = mp->m_sb.sb_dblocks; do_div(resblks, 20); - resblks = min_t(__uint64_t, resblks, 1024); + resblks = min_t(__uint64_t, resblks, 8192); return resblks; } @@ -1417,6 +1419,9 @@ xfs_mountfs( * when at ENOSPC. This is needed for operations like create with * attr, unwritten extent conversion at ENOSPC, etc. Data allocations * are not allowed to use this reserved space. + * + * This may drive us straight to ENOSPC on mount, but that implies + * we were already there on the last unmount. Warn if this occurs. */ if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { resblks = xfs_default_resblks(mp); @@ -1725,26 +1730,30 @@ xfs_mod_incore_sb_unlocked( lcounter += rem; } } else { /* Taking blocks away */ - lcounter += delta; + if (lcounter >= 0) { + mp->m_sb.sb_fdblocks = lcounter + + XFS_ALLOC_SET_ASIDE(mp); + return 0; + } - /* - * If were out of blocks, use any available reserved blocks if - * were allowed to. - */ + /* + * We are out of blocks, use any available reserved + * blocks if were allowed to. + */ + if (!rsvd) + return XFS_ERROR(ENOSPC); - if (lcounter < 0) { - if (rsvd) { - lcounter = (long long)mp->m_resblks_avail + delta; - if (lcounter < 0) { - return XFS_ERROR(ENOSPC); - } - mp->m_resblks_avail = lcounter; - return 0; - } else { /* not reserved */ - return XFS_ERROR(ENOSPC); - } + lcounter = (long long)mp->m_resblks_avail + delta; + if (lcounter >= 0) { + mp->m_resblks_avail = lcounter; + return 0; } + printk_once(KERN_WARNING + "Filesystem \"%s\": reserve blocks depleted! " + "Consider increasing reserve pool size.", + mp->m_fsname); + return XFS_ERROR(ENOSPC); } mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); -- cgit v1.2.3 From 07000ee686cf19e853fa06f7904eff2cfe230ea3 Mon Sep 17 00:00:00 2001 From: Dave Chinner <dchinner@redhat.com> Date: Fri, 5 Mar 2010 04:41:14 +0000 Subject: xfs: return inode fork offset in bulkstat for fsr So that fsr can attempt to get the fork offset of the temporary inode it uses the same as the inode it is defragmenting, pass the fork offset out in the bulkstat information. The bulkstat structure has padding that has always been zeroed, so userspace can tell if this field is set or not by use of the xattr present flag and a non-zero value for the fork offset. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com> --- fs/xfs/xfs_fs.h | 3 ++- fs/xfs/xfs_itable.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index f52ac276277e..7cf7220e7d5f 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -292,7 +292,8 @@ typedef struct xfs_bstat { __s32 bs_extents; /* number of extents */ __u32 bs_gen; /* generation count */ __u16 bs_projid; /* project id */ - unsigned char bs_pad[14]; /* pad space, unused */ + __u16 bs_forkoff; /* inode fork offset in bytes */ + unsigned char bs_pad[12]; /* pad space, unused */ __u32 bs_dmevmask; /* DMIG event mask */ __u16 bs_dmstate; /* DMIG state info */ __u16 bs_aextents; /* attribute number of extents */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 3af02314c605..b1b801e4a28e 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -106,6 +106,7 @@ xfs_bulkstat_one_iget( buf->bs_dmevmask = dic->di_dmevmask; buf->bs_dmstate = dic->di_dmstate; buf->bs_aextents = dic->di_anextents; + buf->bs_forkoff = XFS_IFORK_BOFF(ip); switch (dic->di_format) { case XFS_DINODE_FMT_DEV: @@ -176,6 +177,7 @@ xfs_bulkstat_one_dinode( buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask); buf->bs_dmstate = be16_to_cpu(dic->di_dmstate); buf->bs_aextents = be16_to_cpu(dic->di_anextents); + buf->bs_forkoff = XFS_DFORK_BOFF(dic); switch (dic->di_format) { case XFS_DINODE_FMT_DEV: -- cgit v1.2.3 From 26821ed40b4230259e770c9911180f38fcaa6f59 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 5 Mar 2010 09:21:21 +0100 Subject: make sure data is on disk before calling ->write_inode Similar to the fsync issue fixed a while ago in commit 2daea67e966dc0c42067ebea015ddac6834cef88 we need to write for data to actually hit the disk before writing out the metadata to guarantee data integrity for filesystems that modify the inode in the data I/O completion path. Currently XFS and NFS handle this manually, and AFS has a write_inode method that does nothing but waiting for data, while others are possibly missing out on this. Fortunately this change has a lot less impact than the fsync change as none of the write_inode methods starts data writeout of any form by itself. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/afs/internal.h | 1 - fs/afs/super.c | 1 - fs/afs/write.c | 21 --------------------- fs/fs-writeback.c | 15 ++++++++++----- fs/nfs/inode.c | 7 +------ fs/xfs/linux-2.6/xfs_super.c | 4 ---- 6 files changed, 11 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 6ece2a13bf71..c54dad4e6063 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -733,7 +733,6 @@ extern int afs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata); extern int afs_writepage(struct page *, struct writeback_control *); extern int afs_writepages(struct address_space *, struct writeback_control *); -extern int afs_write_inode(struct inode *, int); extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); diff --git a/fs/afs/super.c b/fs/afs/super.c index e1ea1c240b6a..14f6431598ad 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -48,7 +48,6 @@ struct file_system_type afs_fs_type = { static const struct super_operations afs_super_ops = { .statfs = afs_statfs, .alloc_inode = afs_alloc_inode, - .write_inode = afs_write_inode, .destroy_inode = afs_destroy_inode, .clear_inode = afs_clear_inode, .put_super = afs_put_super, diff --git a/fs/afs/write.c b/fs/afs/write.c index 5e15a21dbf9f..3bed54a294d4 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -584,27 +584,6 @@ int afs_writepages(struct address_space *mapping, return ret; } -/* - * write an inode back - */ -int afs_write_inode(struct inode *inode, int sync) -{ - struct afs_vnode *vnode = AFS_FS_I(inode); - int ret; - - _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); - - ret = 0; - if (sync) { - ret = filemap_fdatawait(inode->i_mapping); - if (ret < 0) - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - } - - _leave(" = %d", ret); - return ret; -} - /* * completion of write to server */ diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1a7c42c64ff4..5f2721b1e4be 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -461,15 +461,20 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = do_writepages(mapping, wbc); - /* Don't write the inode if only I_DIRTY_PAGES was set */ - if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { - int err = write_inode(inode, wait); + /* + * Make sure to wait on the data before writing out the metadata. + * This is important for filesystems that modify metadata on data + * I/O completion. + */ + if (wait) { + int err = filemap_fdatawait(mapping); if (ret == 0) ret = err; } - if (wait) { - int err = filemap_fdatawait(mapping); + /* Don't write the inode if only I_DIRTY_PAGES was set */ + if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + int err = write_inode(inode, wait); if (ret == 0) ret = err; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 7570573bdb30..5ecd952cae1d 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -101,12 +101,7 @@ int nfs_write_inode(struct inode *inode, int sync) { int ret; - if (sync) { - ret = filemap_fdatawait(inode->i_mapping); - if (ret == 0) - ret = nfs_commit_inode(inode, FLUSH_SYNC); - } else - ret = nfs_commit_inode(inode, 0); + ret = nfs_commit_inode(inode, sync ? FLUSH_SYNC : 0); if (ret >= 0) return 0; __mark_inode_dirty(inode, I_DIRTY_DATASYNC); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 25ea2408118f..8f117db6070e 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1075,10 +1075,6 @@ xfs_fs_write_inode( return XFS_ERROR(EIO); if (sync) { - error = xfs_wait_on_pages(ip, 0, -1); - if (error) - goto out; - /* * Make sure the inode has hit stable storage. By using the * log and the fsync transactions we reduce the IOs we have -- cgit v1.2.3 From a9185b41a4f84971b930c519f0c63bd450c4810d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 5 Mar 2010 09:21:37 +0100 Subject: pass writeback_control to ->write_inode This gives the filesystem more information about the writeback that is happening. Trond requested this for the NFS unstable write handling, and other filesystems might benefit from this too by beeing able to distinguish between the different callers in more detail. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/adfs/adfs.h | 2 +- fs/adfs/inode.c | 5 +++-- fs/affs/affs.h | 3 ++- fs/affs/inode.c | 2 +- fs/bfs/inode.c | 5 +++-- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 4 ++-- fs/exofs/exofs.h | 2 +- fs/exofs/inode.c | 4 ++-- fs/ext2/ext2.h | 2 +- fs/ext2/inode.c | 11 +++++++++-- fs/ext3/inode.c | 4 ++-- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 6 +++--- fs/fat/inode.c | 9 +++++++-- fs/fs-writeback.c | 11 +++++------ fs/gfs2/super.c | 5 +++-- fs/hfs/hfs_fs.h | 2 +- fs/hfs/inode.c | 2 +- fs/hfsplus/super.c | 3 ++- fs/jfs/inode.c | 5 ++++- fs/jfs/jfs_inode.h | 2 +- fs/minix/inode.c | 8 +++++--- fs/nfs/inode.c | 5 +++-- fs/nfs/internal.h | 2 +- fs/ntfs/dir.c | 2 +- fs/ntfs/file.c | 2 +- fs/ntfs/inode.c | 2 +- fs/ntfs/inode.h | 4 ++-- fs/ntfs/super.c | 8 ++++++++ fs/omfs/inode.c | 10 ++++++++-- fs/reiserfs/inode.c | 4 ++-- fs/sysv/inode.c | 10 ++++++++-- fs/sysv/sysv.h | 2 +- fs/ubifs/dir.c | 2 +- fs/ubifs/file.c | 8 ++++---- fs/ubifs/super.c | 2 +- fs/udf/inode.c | 4 ++-- fs/udf/udfdecl.h | 2 +- fs/ufs/inode.c | 5 +++-- fs/ufs/ufs.h | 2 +- fs/xfs/linux-2.6/xfs_super.c | 4 ++-- include/linux/ext3_fs.h | 2 +- include/linux/fs.h | 2 +- include/linux/reiserfs_fs.h | 2 +- 45 files changed, 115 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index 9cc18775b832..2ff622f6f547 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -121,7 +121,7 @@ struct adfs_discmap { /* Inode stuff */ struct inode *adfs_iget(struct super_block *sb, struct object_info *obj); -int adfs_write_inode(struct inode *inode,int unused); +int adfs_write_inode(struct inode *inode, struct writeback_control *wbc); int adfs_notify_change(struct dentry *dentry, struct iattr *attr); /* map.c */ diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 3f57ce4bee5d..0f5e30978135 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -9,6 +9,7 @@ */ #include <linux/smp_lock.h> #include <linux/buffer_head.h> +#include <linux/writeback.h> #include "adfs.h" /* @@ -360,7 +361,7 @@ out: * The adfs-specific inode data has already been updated by * adfs_notify_change() */ -int adfs_write_inode(struct inode *inode, int wait) +int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct super_block *sb = inode->i_sb; struct object_info obj; @@ -375,7 +376,7 @@ int adfs_write_inode(struct inode *inode, int wait) obj.attr = ADFS_I(inode)->attr; obj.size = inode->i_size; - ret = adfs_dir_update(sb, &obj, wait); + ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); unlock_kernel(); return ret; } diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 0e40caaba456..861dae68ac12 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -175,7 +175,8 @@ extern void affs_delete_inode(struct inode *inode); extern void affs_clear_inode(struct inode *inode); extern struct inode *affs_iget(struct super_block *sb, unsigned long ino); -extern int affs_write_inode(struct inode *inode, int); +extern int affs_write_inode(struct inode *inode, + struct writeback_control *wbc); extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type); /* file.c */ diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 3c4ec7d864c4..c9744d771d98 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -166,7 +166,7 @@ bad_inode: } int -affs_write_inode(struct inode *inode, int unused) +affs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct super_block *sb = inode->i_sb; struct buffer_head *bh; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 8f3d9fd89604..f22a7d3dc362 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -15,6 +15,7 @@ #include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/vfs.h> +#include <linux/writeback.h> #include <asm/uaccess.h> #include "bfs.h" @@ -98,7 +99,7 @@ error: return ERR_PTR(-EIO); } -static int bfs_write_inode(struct inode *inode, int wait) +static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct bfs_sb_info *info = BFS_SB(inode->i_sb); unsigned int ino = (u16)inode->i_ino; @@ -147,7 +148,7 @@ static int bfs_write_inode(struct inode *inode, int wait) di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); mark_buffer_dirty(bh); - if (wait) { + if (wbc->sync_mode == WB_SYNC_ALL) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) err = -EIO; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2aa8ec6a0981..8b5cfdd4bfc1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2326,7 +2326,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_delete_inode(struct inode *inode); void btrfs_put_inode(struct inode *inode); -int btrfs_write_inode(struct inode *inode, int wait); +int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); void btrfs_dirty_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4deb280f8969..c41db6d45ab6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3968,7 +3968,7 @@ err: return ret; } -int btrfs_write_inode(struct inode *inode, int wait) +int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -3977,7 +3977,7 @@ int btrfs_write_inode(struct inode *inode, int wait) if (root->fs_info->btree_inode == inode) return 0; - if (wait) { + if (wbc->sync_mode == WB_SYNC_ALL) { trans = btrfs_join_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); ret = btrfs_commit_transaction(trans, root); diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 59b8bf2825c7..8442e353309f 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -261,7 +261,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata); extern struct inode *exofs_iget(struct super_block *, unsigned long); struct inode *exofs_new_inode(struct inode *, int); -extern int exofs_write_inode(struct inode *, int); +extern int exofs_write_inode(struct inode *, struct writeback_control *wbc); extern void exofs_delete_inode(struct inode *); /* dir.c: */ diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 5514f3c2c2f4..a17e4b733e35 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1280,9 +1280,9 @@ out: return ret; } -int exofs_write_inode(struct inode *inode, int wait) +int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) { - return exofs_update_inode(inode, wait); + return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } /* diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 061914add3cf..0b038e47ad2f 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -118,7 +118,7 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned); /* inode.c */ extern struct inode *ext2_iget (struct super_block *, unsigned long); -extern int ext2_write_inode (struct inode *, int); +extern int ext2_write_inode (struct inode *, struct writeback_control *); extern void ext2_delete_inode (struct inode *); extern int ext2_sync_inode (struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 71b032c65a02..36ae1cac767c 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -41,6 +41,8 @@ MODULE_AUTHOR("Remy Card and others"); MODULE_DESCRIPTION("Second Extended Filesystem"); MODULE_LICENSE("GPL"); +static int __ext2_write_inode(struct inode *inode, int do_sync); + /* * Test whether an inode is a fast symlink. */ @@ -64,7 +66,7 @@ void ext2_delete_inode (struct inode * inode) goto no_delete; EXT2_I(inode)->i_dtime = get_seconds(); mark_inode_dirty(inode); - ext2_write_inode(inode, inode_needs_sync(inode)); + __ext2_write_inode(inode, inode_needs_sync(inode)); inode->i_size = 0; if (inode->i_blocks) @@ -1335,7 +1337,7 @@ bad_inode: return ERR_PTR(ret); } -int ext2_write_inode(struct inode *inode, int do_sync) +static int __ext2_write_inode(struct inode *inode, int do_sync) { struct ext2_inode_info *ei = EXT2_I(inode); struct super_block *sb = inode->i_sb; @@ -1440,6 +1442,11 @@ int ext2_write_inode(struct inode *inode, int do_sync) return err; } +int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + int ext2_sync_inode(struct inode *inode) { struct writeback_control wbc = { diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 455e6e6e5cb9..7aca55fcc976 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -3096,7 +3096,7 @@ out_brelse: * `stuff()' is running, and the new i_size will be lost. Plus the inode * will no longer be on the superblock's dirty inode list. */ -int ext3_write_inode(struct inode *inode, int wait) +int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) { if (current->flags & PF_MEMALLOC) return 0; @@ -3107,7 +3107,7 @@ int ext3_write_inode(struct inode *inode, int wait) return -EIO; } - if (!wait) + if (wbc->sync_mode != WB_SYNC_ALL) return 0; return ext3_force_commit(inode->i_sb); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4cedc91ec59d..50af1a2c65e7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1416,7 +1416,7 @@ int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); extern struct inode *ext4_iget(struct super_block *, unsigned long); -extern int ext4_write_inode(struct inode *, int); +extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct dentry *, struct iattr *); extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e11952404e02..d01a6cdbf854 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5177,7 +5177,7 @@ out_brelse: * `stuff()' is running, and the new i_size will be lost. Plus the inode * will no longer be on the superblock's dirty inode list. */ -int ext4_write_inode(struct inode *inode, int wait) +int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; @@ -5191,7 +5191,7 @@ int ext4_write_inode(struct inode *inode, int wait) return -EIO; } - if (!wait) + if (wbc->sync_mode != WB_SYNC_ALL) return 0; err = ext4_force_commit(inode->i_sb); @@ -5201,7 +5201,7 @@ int ext4_write_inode(struct inode *inode, int wait) err = ext4_get_inode_loc(inode, &iloc); if (err) return err; - if (wait) + if (wbc->sync_mode == WB_SYNC_ALL) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { ext4_error(inode->i_sb, __func__, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 14da530b05ca..fbeecdc194dc 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -577,7 +577,7 @@ static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi, return i_pos; } -static int fat_write_inode(struct inode *inode, int wait) +static int __fat_write_inode(struct inode *inode, int wait) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); @@ -634,9 +634,14 @@ retry: return err; } +static int fat_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + int fat_sync_inode(struct inode *inode) { - return fat_write_inode(inode, 1); + return __fat_write_inode(inode, 1); } EXPORT_SYMBOL_GPL(fat_sync_inode); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5f2721b1e4be..76fc4d594acb 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -381,10 +381,10 @@ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } -static int write_inode(struct inode *inode, int sync) +static int write_inode(struct inode *inode, struct writeback_control *wbc) { if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) - return inode->i_sb->s_op->write_inode(inode, sync); + return inode->i_sb->s_op->write_inode(inode, wbc); return 0; } @@ -421,7 +421,6 @@ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; - int wait = wbc->sync_mode == WB_SYNC_ALL; unsigned dirty; int ret; @@ -439,7 +438,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * We'll have another go at writing back this inode when we * completed a full scan of b_io. */ - if (!wait) { + if (wbc->sync_mode != WB_SYNC_ALL) { requeue_io(inode); return 0; } @@ -466,7 +465,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * This is important for filesystems that modify metadata on data * I/O completion. */ - if (wait) { + if (wbc->sync_mode == WB_SYNC_ALL) { int err = filemap_fdatawait(mapping); if (ret == 0) ret = err; @@ -474,7 +473,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { - int err = write_inode(inode, wait); + int err = write_inode(inode, wbc); if (ret == 0) ret = err; } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index e5e22629da67..ca87598ead7f 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -22,6 +22,7 @@ #include <linux/crc32.h> #include <linux/time.h> #include <linux/wait.h> +#include <linux/writeback.h> #include "gfs2.h" #include "incore.h" @@ -711,7 +712,7 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) * Returns: errno */ -static int gfs2_write_inode(struct inode *inode, int sync) +static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -745,7 +746,7 @@ static int gfs2_write_inode(struct inode *inode, int sync) do_unlock: gfs2_glock_dq_uninit(&gh); do_flush: - if (sync != 0) + if (wbc->sync_mode == WB_SYNC_ALL) gfs2_log_flush(GFS2_SB(inode), ip->i_gl); return ret; } diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 052387e11671..fe35e3b626c4 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -188,7 +188,7 @@ extern const struct address_space_operations hfs_btree_aops; extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); -extern int hfs_write_inode(struct inode *, int); +extern int hfs_write_inode(struct inode *, struct writeback_control *); extern int hfs_inode_setattr(struct dentry *, struct iattr *); extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, __be32 log_size, __be32 phys_size, u32 clump_size); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index a1cbff2b4d99..14f5cb1b9fdc 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -381,7 +381,7 @@ void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext, HFS_SB(inode->i_sb)->alloc_blksz); } -int hfs_write_inode(struct inode *inode, int unused) +int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct inode *main_inode = inode; struct hfs_find_data fd; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 43022f3d5148..74b473a8ef92 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -87,7 +87,8 @@ bad_inode: return ERR_PTR(err); } -static int hfsplus_write_inode(struct inode *inode, int unused) +static int hfsplus_write_inode(struct inode *inode, + struct writeback_control *wbc) { struct hfsplus_vh *vhdr; int ret = 0; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index b2ae190a77ba..182b78cc3e62 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -22,6 +22,7 @@ #include <linux/buffer_head.h> #include <linux/pagemap.h> #include <linux/quotaops.h> +#include <linux/writeback.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" @@ -120,8 +121,10 @@ int jfs_commit_inode(struct inode *inode, int wait) return rc; } -int jfs_write_inode(struct inode *inode, int wait) +int jfs_write_inode(struct inode *inode, struct writeback_control *wbc) { + int wait = wbc->sync_mode == WB_SYNC_ALL; + if (test_cflag(COMMIT_Nolink, inode)) return 0; /* diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 1eff7db34d63..15902b03c2a7 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -26,7 +26,7 @@ extern long jfs_ioctl(struct file *, unsigned int, unsigned long); extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); extern struct inode *jfs_iget(struct super_block *, unsigned long); extern int jfs_commit_inode(struct inode *, int); -extern int jfs_write_inode(struct inode*, int); +extern int jfs_write_inode(struct inode *, struct writeback_control *); extern void jfs_delete_inode(struct inode *); extern void jfs_dirty_inode(struct inode *); extern void jfs_truncate(struct inode *); diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 74ea82d72164..756f8c93780c 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -17,8 +17,10 @@ #include <linux/init.h> #include <linux/highuid.h> #include <linux/vfs.h> +#include <linux/writeback.h> -static int minix_write_inode(struct inode * inode, int wait); +static int minix_write_inode(struct inode *inode, + struct writeback_control *wbc); static int minix_statfs(struct dentry *dentry, struct kstatfs *buf); static int minix_remount (struct super_block * sb, int * flags, char * data); @@ -552,7 +554,7 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode) return bh; } -static int minix_write_inode(struct inode *inode, int wait) +static int minix_write_inode(struct inode *inode, struct writeback_control *wbc) { int err = 0; struct buffer_head *bh; @@ -563,7 +565,7 @@ static int minix_write_inode(struct inode *inode, int wait) bh = V2_minix_update_inode(inode); if (!bh) return -EIO; - if (wait && buffer_dirty(bh)) { + if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { printk("IO error syncing minix inode [%s:%08lx]\n", diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 5ecd952cae1d..7f9ecc46f3fb 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -97,11 +97,12 @@ u64 nfs_compat_user_ino64(u64 fileid) return ino; } -int nfs_write_inode(struct inode *inode, int sync) +int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; - ret = nfs_commit_inode(inode, sync ? FLUSH_SYNC : 0); + ret = nfs_commit_inode(inode, + wbc->sync_mode == WB_SYNC_ALL ? FLUSH_SYNC : 0); if (ret >= 0) return 0; __mark_inode_dirty(inode, I_DIRTY_DATASYNC); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 29e464d23b32..11f82f03c5de 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -211,7 +211,7 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); extern struct workqueue_struct *nfsiod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_destroy_inode(struct inode *); -extern int nfs_write_inode(struct inode *,int); +extern int nfs_write_inode(struct inode *, struct writeback_control *); extern void nfs_clear_inode(struct inode *); #ifdef CONFIG_NFS_V4 extern void nfs4_clear_inode(struct inode *); diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 5a9e34475e37..9173e82a45d1 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1545,7 +1545,7 @@ static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry, write_inode_now(bmp_vi, !datasync); iput(bmp_vi); } - ret = ntfs_write_inode(vi, 1); + ret = __ntfs_write_inode(vi, 1); write_inode_now(vi, !datasync); err = sync_blockdev(vi->i_sb->s_bdev); if (unlikely(err && !ret)) diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 43179ddd336f..b681c71d7069 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2182,7 +2182,7 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry, ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); BUG_ON(S_ISDIR(vi->i_mode)); if (!datasync || !NInoNonResident(NTFS_I(vi))) - ret = ntfs_write_inode(vi, 1); + ret = __ntfs_write_inode(vi, 1); write_inode_now(vi, !datasync); /* * NOTE: If we were to use mapping->private_list (see ext2 and diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index dc2505abb6d7..4b57fb1eac2a 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2957,7 +2957,7 @@ out: * * Return 0 on success and -errno on error. */ -int ntfs_write_inode(struct inode *vi, int sync) +int __ntfs_write_inode(struct inode *vi, int sync) { sle64 nt; ntfs_inode *ni = NTFS_I(vi); diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 117eaf8032a3..9a113544605d 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -307,12 +307,12 @@ extern void ntfs_truncate_vfs(struct inode *vi); extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr); -extern int ntfs_write_inode(struct inode *vi, int sync); +extern int __ntfs_write_inode(struct inode *vi, int sync); static inline void ntfs_commit_inode(struct inode *vi) { if (!is_bad_inode(vi)) - ntfs_write_inode(vi, 1); + __ntfs_write_inode(vi, 1); return; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 80b04770e8e9..1cf39dfaee7a 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -39,6 +39,7 @@ #include "dir.h" #include "debug.h" #include "index.h" +#include "inode.h" #include "aops.h" #include "layout.h" #include "malloc.h" @@ -2662,6 +2663,13 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) return 0; } +#ifdef NTFS_RW +static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc) +{ + return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL); +} +#endif + /** * The complete super operations. */ diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index f3b7c1541f3a..75d9b5ba1d45 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -11,6 +11,7 @@ #include <linux/parser.h> #include <linux/buffer_head.h> #include <linux/vmalloc.h> +#include <linux/writeback.h> #include <linux/crc-itu-t.h> #include "omfs.h" @@ -89,7 +90,7 @@ static void omfs_update_checksums(struct omfs_inode *oi) oi->i_head.h_check_xor = xor; } -static int omfs_write_inode(struct inode *inode, int wait) +static int __omfs_write_inode(struct inode *inode, int wait) { struct omfs_inode *oi; struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); @@ -162,9 +163,14 @@ out: return ret; } +static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + int omfs_sync_inode(struct inode *inode) { - return omfs_write_inode(inode, 1); + return __omfs_write_inode(inode, 1); } /* diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 2df0f5c7c60b..0d651f980a8d 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1615,7 +1615,7 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, ** to properly mark inodes for datasync and such, but only actually ** does something when called for a synchronous update. */ -int reiserfs_write_inode(struct inode *inode, int do_sync) +int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct reiserfs_transaction_handle th; int jbegin_count = 1; @@ -1627,7 +1627,7 @@ int reiserfs_write_inode(struct inode *inode, int do_sync) ** inode needs to reach disk for safety, and they can safely be ** ignored because the altered inode has already been logged. */ - if (do_sync && !(current->flags & PF_MEMALLOC)) { + if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) { reiserfs_write_lock(inode->i_sb); if (!journal_begin(&th, inode->i_sb, jbegin_count)) { reiserfs_update_sd(&th, inode); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 9824743832a7..4573734d723d 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -26,6 +26,7 @@ #include <linux/init.h> #include <linux/buffer_head.h> #include <linux/vfs.h> +#include <linux/writeback.h> #include <linux/namei.h> #include <asm/byteorder.h> #include "sysv.h" @@ -246,7 +247,7 @@ bad_inode: return ERR_PTR(-EIO); } -int sysv_write_inode(struct inode *inode, int wait) +static int __sysv_write_inode(struct inode *inode, int wait) { struct super_block * sb = inode->i_sb; struct sysv_sb_info * sbi = SYSV_SB(sb); @@ -296,9 +297,14 @@ int sysv_write_inode(struct inode *inode, int wait) return 0; } +int sysv_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + int sysv_sync_inode(struct inode *inode) { - return sysv_write_inode(inode, 1); + return __sysv_write_inode(inode, 1); } static void sysv_delete_inode(struct inode *inode) diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 53786eb5cf60..94cb9b4d76c2 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -142,7 +142,7 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping, /* inode.c */ extern struct inode *sysv_iget(struct super_block *, unsigned int); -extern int sysv_write_inode(struct inode *, int); +extern int sysv_write_inode(struct inode *, struct writeback_control *wbc); extern int sysv_sync_inode(struct inode *); extern void sysv_set_inode(struct inode *, dev_t); extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 552fb0111fff..401e503d44a1 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1120,7 +1120,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, if (release) ubifs_release_budget(c, &ino_req); if (IS_SYNC(old_inode)) - err = old_inode->i_sb->s_op->write_inode(old_inode, 1); + err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); return err; out_cancel: diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 16a6444330ec..e26c02ab6cd5 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1011,7 +1011,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) /* Is the page fully inside @i_size? */ if (page->index < end_index) { if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) { - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out_unlock; /* @@ -1039,7 +1039,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) kunmap_atomic(kaddr, KM_USER0); if (i_size > synced_i_size) { - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out_unlock; } @@ -1242,7 +1242,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, if (release) ubifs_release_budget(c, &req); if (IS_SYNC(inode)) - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); return err; out: @@ -1316,7 +1316,7 @@ int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) * the inode unless this is a 'datasync()' call. */ if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) return err; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 43f9d19a6f33..4d2f2157dd3f 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -283,7 +283,7 @@ static void ubifs_destroy_inode(struct inode *inode) /* * Note, Linux write-back code calls this without 'i_mutex'. */ -static int ubifs_write_inode(struct inode *inode, int wait) +static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) { int err = 0; struct ubifs_info *c = inode->i_sb->s_fs_info; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 378a7592257c..b02089247296 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1373,12 +1373,12 @@ static mode_t udf_convert_permissions(struct fileEntry *fe) return mode; } -int udf_write_inode(struct inode *inode, int sync) +int udf_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; lock_kernel(); - ret = udf_update_inode(inode, sync); + ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); unlock_kernel(); return ret; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 8d46f4294ee7..4223ac855da9 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -142,7 +142,7 @@ extern void udf_truncate(struct inode *); extern void udf_read_inode(struct inode *); extern void udf_delete_inode(struct inode *); extern void udf_clear_inode(struct inode *); -extern int udf_write_inode(struct inode *, int); +extern int udf_write_inode(struct inode *, struct writeback_control *wbc); extern long udf_block_map(struct inode *, sector_t); extern int udf_extend_file(struct inode *, struct extent_position *, struct kernel_long_ad *, sector_t); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 7cf33379fd46..0a627e08610b 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -36,6 +36,7 @@ #include <linux/mm.h> #include <linux/smp_lock.h> #include <linux/buffer_head.h> +#include <linux/writeback.h> #include "ufs_fs.h" #include "ufs.h" @@ -890,11 +891,11 @@ static int ufs_update_inode(struct inode * inode, int do_sync) return 0; } -int ufs_write_inode (struct inode * inode, int wait) +int ufs_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; lock_kernel(); - ret = ufs_update_inode (inode, wait); + ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); unlock_kernel(); return ret; } diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 01d0e2a3b230..43f9f5d5670e 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -106,7 +106,7 @@ extern struct inode * ufs_new_inode (struct inode *, int); /* inode.c */ extern struct inode *ufs_iget(struct super_block *, unsigned long); -extern int ufs_write_inode (struct inode *, int); +extern int ufs_write_inode (struct inode *, struct writeback_control *); extern int ufs_sync_inode (struct inode *); extern void ufs_delete_inode (struct inode *); extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 8f117db6070e..71345a370d9f 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1063,7 +1063,7 @@ xfs_log_inode( STATIC int xfs_fs_write_inode( struct inode *inode, - int sync) + struct writeback_control *wbc) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1074,7 +1074,7 @@ xfs_fs_write_inode( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - if (sync) { + if (wbc->sync_mode == WB_SYNC_ALL) { /* * Make sure the inode has hit stable storage. By using the * log and the fsync transactions we reduce the IOs we have diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 6b049030fbe6..deac2566450e 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -877,7 +877,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, int create); extern struct inode *ext3_iget(struct super_block *, unsigned long); -extern int ext3_write_inode (struct inode *, int); +extern int ext3_write_inode (struct inode *, struct writeback_control *); extern int ext3_setattr (struct dentry *, struct iattr *); extern void ext3_delete_inode (struct inode *); extern int ext3_sync_inode (handle_t *, struct inode *); diff --git a/include/linux/fs.h b/include/linux/fs.h index 5b3182c7eb5f..45689621a851 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1557,7 +1557,7 @@ struct super_operations { void (*destroy_inode)(struct inode *); void (*dirty_inode) (struct inode *); - int (*write_inode) (struct inode *, int); + int (*write_inode) (struct inode *, struct writeback_control *wbc); void (*drop_inode) (struct inode *); void (*delete_inode) (struct inode *); void (*put_super) (struct super_block *); diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 1ba3cf6edfbb..3b603f474186 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -2034,7 +2034,7 @@ void reiserfs_read_locked_inode(struct inode *inode, int reiserfs_find_actor(struct inode *inode, void *p); int reiserfs_init_locked_inode(struct inode *inode, void *p); void reiserfs_delete_inode(struct inode *inode); -int reiserfs_write_inode(struct inode *inode, int); +int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc); int reiserfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create); struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, -- cgit v1.2.3 From 8fc795f703c5138e1a8bfb88c69f52632031aa6a Mon Sep 17 00:00:00 2001 From: Trond Myklebust <Trond.Myklebust@netapp.com> Date: Fri, 19 Feb 2010 16:46:56 -0800 Subject: NFS: Cleanup - move nfs_write_inode() into fs/nfs/write.c The sole purpose of nfs_write_inode is to commit unstable writes, so move it into fs/nfs/write.c, and make nfs_commit_inode static. Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/inode.c | 12 ------------ fs/nfs/write.c | 24 +++++++++++++++++++++++- include/linux/nfs_fs.h | 7 ------- 3 files changed, 23 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 7f9ecc46f3fb..89e98312599d 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -97,18 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid) return ino; } -int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - int ret; - - ret = nfs_commit_inode(inode, - wbc->sync_mode == WB_SYNC_ALL ? FLUSH_SYNC : 0); - if (ret >= 0) - return 0; - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - return ret; -} - void nfs_clear_inode(struct inode *inode) { /* diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d63d964a0392..09e97097baaa 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1391,7 +1391,7 @@ static const struct rpc_call_ops nfs_commit_ops = { .rpc_release = nfs_commit_release, }; -int nfs_commit_inode(struct inode *inode, int how) +static int nfs_commit_inode(struct inode *inode, int how) { LIST_HEAD(head); int res; @@ -1406,13 +1406,35 @@ int nfs_commit_inode(struct inode *inode, int how) } return res; } + +static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) +{ + int ret; + + ret = nfs_commit_inode(inode, + wbc->sync_mode == WB_SYNC_ALL ? FLUSH_SYNC : 0); + if (ret >= 0) + return 0; + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + return ret; +} #else static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how) { return 0; } + +static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) +{ + return 0; +} #endif +int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return nfs_commit_unstable_pages(inode, wbc); +} + long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) { struct inode *inode = mapping->host; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index d09db1bc9083..384ea3ef2863 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -483,15 +483,8 @@ extern int nfs_wb_nocommit(struct inode *inode); extern int nfs_wb_page(struct inode *inode, struct page* page); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -extern int nfs_commit_inode(struct inode *, int); extern struct nfs_write_data *nfs_commitdata_alloc(void); extern void nfs_commit_free(struct nfs_write_data *wdata); -#else -static inline int -nfs_commit_inode(struct inode *inode, int how) -{ - return 0; -} #endif static inline int -- cgit v1.2.3 From ff778d02bf867e1733a09b34ad6dbb723b024814 Mon Sep 17 00:00:00 2001 From: Trond Myklebust <Trond.Myklebust@netapp.com> Date: Fri, 19 Feb 2010 16:53:39 -0800 Subject: NFS: Add a count of the number of unstable writes carried by an inode In order to know when we should do opportunistic commits of the unstable writes, when the VM is doing a background flush, we add a field to count the number of unstable writes. Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/inode.c | 1 + fs/nfs/write.c | 14 ++++++++++---- include/linux/nfs_fs.h | 1 + 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 89e98312599d..aa5a831001ab 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1404,6 +1404,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); nfsi->npages = 0; + nfsi->ncommit = 0; atomic_set(&nfsi->silly_count, 1); INIT_HLIST_HEAD(&nfsi->silly_list); init_waitqueue_head(&nfsi->waitqueue); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 09e97097baaa..dc08a6fbde67 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -438,6 +438,7 @@ nfs_mark_request_commit(struct nfs_page *req) radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_COMMIT); + nfsi->ncommit++; spin_unlock(&inode->i_lock); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); @@ -573,11 +574,15 @@ static int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) { struct nfs_inode *nfsi = NFS_I(inode); + int ret; if (!nfs_need_commit(nfsi)) return 0; - return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); + ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); + if (ret > 0) + nfsi->ncommit -= ret; + return ret; } #else static inline int nfs_need_commit(struct nfs_inode *nfsi) @@ -642,9 +647,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, spin_lock(&inode->i_lock); } - if (nfs_clear_request_commit(req)) - radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, - req->wb_index, NFS_PAGE_TAG_COMMIT); + if (nfs_clear_request_commit(req) && + radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, + req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) + NFS_I(inode)->ncommit--; /* Okay, the request matches. Update the region */ if (offset < req->wb_offset) { diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 384ea3ef2863..309217f46e28 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -166,6 +166,7 @@ struct nfs_inode { struct radix_tree_root nfs_page_tree; unsigned long npages; + unsigned long ncommit; /* Open contexts for shared mmap writes */ struct list_head open_files; -- cgit v1.2.3 From 420e3646bb7d93a571734034249fbb1ae1a7a5c7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust <Trond.Myklebust@netapp.com> Date: Fri, 19 Feb 2010 17:00:02 -0800 Subject: NFS: Reduce the number of unnecessary COMMIT calls If the caller is doing a non-blocking flush, and there are still writebacks pending on the wire, we can usually defer the COMMIT call until those writes are done. Also ensure that we honour the wbc->nonblocking flag. Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/write.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index dc08a6fbde67..fc05e35da6a9 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1415,12 +1415,30 @@ static int nfs_commit_inode(struct inode *inode, int how) static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) { - int ret; + struct nfs_inode *nfsi = NFS_I(inode); + int flags = FLUSH_SYNC; + int ret = 0; - ret = nfs_commit_inode(inode, - wbc->sync_mode == WB_SYNC_ALL ? FLUSH_SYNC : 0); - if (ret >= 0) + /* Don't commit yet if this is a non-blocking flush and there are + * lots of outstanding writes for this mapping. + */ + if (wbc->sync_mode == WB_SYNC_NONE && + nfsi->ncommit <= (nfsi->npages >> 1)) + goto out_mark_dirty; + + if (wbc->nonblocking) + flags = 0; + ret = nfs_commit_inode(inode, flags); + if (ret >= 0) { + if (wbc->sync_mode == WB_SYNC_NONE) { + if (ret < wbc->nr_to_write) + wbc->nr_to_write -= ret; + else + wbc->nr_to_write = 0; + } return 0; + } +out_mark_dirty: __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; } -- cgit v1.2.3 From 5bad5abec4058c5214bfc72cec418348d6747977 Mon Sep 17 00:00:00 2001 From: Trond Myklebust <Trond.Myklebust@netapp.com> Date: Fri, 19 Feb 2010 17:02:24 -0800 Subject: NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Acked-by: Wu Fengguang <fengguang.wu@intel.com> --- fs/nfs/write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index fc05e35da6a9..704e67d392e5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1426,7 +1426,7 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr nfsi->ncommit <= (nfsi->npages >> 1)) goto out_mark_dirty; - if (wbc->nonblocking) + if (wbc->nonblocking || wbc->for_background) flags = 0; ret = nfs_commit_inode(inode, flags); if (ret >= 0) { -- cgit v1.2.3 From 2928db1ffeacc9717c2d5c230d450bcc377b3ae9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust <Trond.Myklebust@netapp.com> Date: Fri, 19 Feb 2010 17:03:18 -0800 Subject: NFS: Ensure inode is always marked I_DIRTY_DATASYNC, if it has unstable pages Since nfs_scan_list() doesn't wait for locked pages, we have a race in which it is possible to end up with an inode that needs to send a COMMIT, but which does not have the I_DIRTY_DATASYNC flag set. Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/write.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 704e67d392e5..e40e949598fd 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -582,6 +582,8 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, u ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); if (ret > 0) nfsi->ncommit -= ret; + if (nfs_need_commit(NFS_I(inode))) + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; } #else -- cgit v1.2.3 From c988950eb6dd6f8e6d98503ca094622729e9aa13 Mon Sep 17 00:00:00 2001 From: Trond Myklebust <Trond.Myklebust@netapp.com> Date: Fri, 19 Feb 2010 17:03:21 -0800 Subject: NFS: Simplify nfs_wb_page_cancel() In all cases we should be able to just remove the request and call cancel_dirty_page(). Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/write.c | 39 +-------------------------------------- include/linux/nfs_fs.h | 2 -- 2 files changed, 1 insertion(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e40e949598fd..dc7f5e9a23b4 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -540,19 +540,6 @@ static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, u return res; } -static void nfs_cancel_commit_list(struct list_head *head) -{ - struct nfs_page *req; - - while(!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_clear_request_commit(req); - nfs_inode_remove_request(req); - nfs_unlock_request(req); - } -} - #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) static int nfs_need_commit(struct nfs_inode *nfsi) @@ -1495,13 +1482,6 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr pages = nfs_scan_commit(inode, &head, idx_start, npages); if (pages == 0) break; - if (how & FLUSH_INVALIDATE) { - spin_unlock(&inode->i_lock); - nfs_cancel_commit_list(&head); - ret = pages; - spin_lock(&inode->i_lock); - continue; - } pages += nfs_scan_commit(inode, &head, 0, 0); spin_unlock(&inode->i_lock); ret = nfs_commit_list(inode, &head, how); @@ -1558,26 +1538,13 @@ int nfs_wb_nocommit(struct inode *inode) int nfs_wb_page_cancel(struct inode *inode, struct page *page) { struct nfs_page *req; - loff_t range_start = page_offset(page); - loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); - struct writeback_control wbc = { - .bdi = page->mapping->backing_dev_info, - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = range_start, - .range_end = range_end, - }; int ret = 0; BUG_ON(!PageLocked(page)); for (;;) { req = nfs_page_find_request(page); if (req == NULL) - goto out; - if (test_bit(PG_CLEAN, &req->wb_flags)) { - nfs_release_request(req); break; - } if (nfs_lock_request_dontget(req)) { nfs_inode_remove_request(req); /* @@ -1591,12 +1558,8 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) ret = nfs_wait_on_request(req); nfs_release_request(req); if (ret < 0) - goto out; + break; } - if (!PagePrivate(page)) - return 0; - ret = nfs_sync_mapping_wait(page->mapping, &wbc, FLUSH_INVALIDATE); -out: return ret; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 309217f46e28..1083134c02ff 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -34,8 +34,6 @@ #define FLUSH_LOWPRI 8 /* low priority background flush */ #define FLUSH_HIGHPRI 16 /* high priority memory reclaim flush */ #define FLUSH_NOCOMMIT 32 /* Don't send the NFSv3/v4 COMMIT */ -#define FLUSH_INVALIDATE 64 /* Invalidate the page cache */ -#define FLUSH_NOWRITEPAGE 128 /* Don't call writepage() */ #ifdef __KERNEL__ -- cgit v1.2.3 From acdc53b2146c7ee67feb1f02f7bc3020126514b8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust <Trond.Myklebust@netapp.com> Date: Fri, 19 Feb 2010 17:03:26 -0800 Subject: NFS: Replace __nfs_write_mapping with sync_inode() Now that we have correct COMMIT semantics in writeback_single_inode, we can reduce and simplify nfs_wb_all(). Also replace nfs_wb_nocommit() with a call to filemap_write_and_wait(), which doesn't need to hold the inode->i_mutex. With that done, we can eliminate nfs_write_mapping() altogether. Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/inode.c | 15 +++++---------- fs/nfs/write.c | 42 +++++------------------------------------- include/linux/nfs_fs.h | 2 -- 3 files changed, 10 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index aa5a831001ab..443772df9b17 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -495,17 +495,11 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; int err; - /* - * Flush out writes to the server in order to update c/mtime. - * - * Hold the i_mutex to suspend application writes temporarily; - * this prevents long-running writing applications from blocking - * nfs_wb_nocommit. - */ + /* Flush out writes to the server in order to update c/mtime. */ if (S_ISREG(inode->i_mode)) { - mutex_lock(&inode->i_mutex); - nfs_wb_nocommit(inode); - mutex_unlock(&inode->i_mutex); + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto out; } /* @@ -529,6 +523,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) generic_fillattr(inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); } +out: return err; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index dc7f5e9a23b4..0b323091b481 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1454,7 +1454,6 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr pgoff_t idx_start, idx_end; unsigned int npages = 0; LIST_HEAD(head); - int nocommit = how & FLUSH_NOCOMMIT; long pages, ret; /* FIXME */ @@ -1471,14 +1470,11 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr npages = 0; } } - how &= ~FLUSH_NOCOMMIT; spin_lock(&inode->i_lock); do { ret = nfs_wait_on_requests_locked(inode, idx_start, npages); if (ret != 0) continue; - if (nocommit) - break; pages = nfs_scan_commit(inode, &head, idx_start, npages); if (pages == 0) break; @@ -1492,47 +1488,19 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr return ret; } -static int __nfs_write_mapping(struct address_space *mapping, struct writeback_control *wbc, int how) -{ - int ret; - - ret = nfs_writepages(mapping, wbc); - if (ret < 0) - goto out; - ret = nfs_sync_mapping_wait(mapping, wbc, how); - if (ret < 0) - goto out; - return 0; -out: - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - return ret; -} - -/* Two pass sync: first using WB_SYNC_NONE, then WB_SYNC_ALL */ -static int nfs_write_mapping(struct address_space *mapping, int how) +/* + * flush the inode to disk. + */ +int nfs_wb_all(struct inode *inode) { struct writeback_control wbc = { - .bdi = mapping->backing_dev_info, .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = 0, .range_end = LLONG_MAX, }; - return __nfs_write_mapping(mapping, &wbc, how); -} - -/* - * flush the inode to disk. - */ -int nfs_wb_all(struct inode *inode) -{ - return nfs_write_mapping(inode->i_mapping, 0); -} - -int nfs_wb_nocommit(struct inode *inode) -{ - return nfs_write_mapping(inode->i_mapping, FLUSH_NOCOMMIT); + return sync_inode(inode, &wbc); } int nfs_wb_page_cancel(struct inode *inode, struct page *page) diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 1083134c02ff..93f439e7c5bf 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -33,7 +33,6 @@ #define FLUSH_STABLE 4 /* commit to stable storage */ #define FLUSH_LOWPRI 8 /* low priority background flush */ #define FLUSH_HIGHPRI 16 /* high priority memory reclaim flush */ -#define FLUSH_NOCOMMIT 32 /* Don't send the NFSv3/v4 COMMIT */ #ifdef __KERNEL__ @@ -478,7 +477,6 @@ extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); */ extern long nfs_sync_mapping_wait(struct address_space *, struct writeback_control *, int); extern int nfs_wb_all(struct inode *inode); -extern int nfs_wb_nocommit(struct inode *inode); extern int nfs_wb_page(struct inode *inode, struct page* page); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -- cgit v1.2.3 From 7f2f12d963e7c33a93bfb0b22f0178eb1e6a4196 Mon Sep 17 00:00:00 2001 From: Trond Myklebust <Trond.Myklebust@netapp.com> Date: Fri, 19 Feb 2010 17:03:28 -0800 Subject: NFS: Simplify nfs_wb_page() Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/write.c | 120 ++++++++++--------------------------------------- include/linux/nfs_fs.h | 1 - 2 files changed, 23 insertions(+), 98 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0b323091b481..53ff70e23993 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -502,44 +502,6 @@ int nfs_reschedule_unstable_write(struct nfs_page *req) } #endif -/* - * Wait for a request to complete. - * - * Interruptible by fatal signals only. - */ -static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, unsigned int npages) -{ - struct nfs_inode *nfsi = NFS_I(inode); - struct nfs_page *req; - pgoff_t idx_end, next; - unsigned int res = 0; - int error; - - if (npages == 0) - idx_end = ~0; - else - idx_end = idx_start + npages - 1; - - next = idx_start; - while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_LOCKED)) { - if (req->wb_index > idx_end) - break; - - next = req->wb_index + 1; - BUG_ON(!NFS_WBACK_BUSY(req)); - - kref_get(&req->wb_kref); - spin_unlock(&inode->i_lock); - error = nfs_wait_on_request(req); - nfs_release_request(req); - spin_lock(&inode->i_lock); - if (error < 0) - return error; - res++; - } - return res; -} - #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) static int nfs_need_commit(struct nfs_inode *nfsi) @@ -1432,7 +1394,7 @@ out_mark_dirty: return ret; } #else -static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how) +static int nfs_commit_inode(struct inode *inode, int how) { return 0; } @@ -1448,46 +1410,6 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) return nfs_commit_unstable_pages(inode, wbc); } -long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) -{ - struct inode *inode = mapping->host; - pgoff_t idx_start, idx_end; - unsigned int npages = 0; - LIST_HEAD(head); - long pages, ret; - - /* FIXME */ - if (wbc->range_cyclic) - idx_start = 0; - else { - idx_start = wbc->range_start >> PAGE_CACHE_SHIFT; - idx_end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (idx_end > idx_start) { - pgoff_t l_npages = 1 + idx_end - idx_start; - npages = l_npages; - if (sizeof(npages) != sizeof(l_npages) && - (pgoff_t)npages != l_npages) - npages = 0; - } - } - spin_lock(&inode->i_lock); - do { - ret = nfs_wait_on_requests_locked(inode, idx_start, npages); - if (ret != 0) - continue; - pages = nfs_scan_commit(inode, &head, idx_start, npages); - if (pages == 0) - break; - pages += nfs_scan_commit(inode, &head, 0, 0); - spin_unlock(&inode->i_lock); - ret = nfs_commit_list(inode, &head, how); - spin_lock(&inode->i_lock); - - } while (ret >= 0); - spin_unlock(&inode->i_lock); - return ret; -} - /* * flush the inode to disk. */ @@ -1531,45 +1453,49 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) return ret; } -static int nfs_wb_page_priority(struct inode *inode, struct page *page, - int how) +/* + * Write back all requests on one page - we do this before reading it. + */ +int nfs_wb_page(struct inode *inode, struct page *page) { loff_t range_start = page_offset(page); loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); struct writeback_control wbc = { - .bdi = page->mapping->backing_dev_info, .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, + .nr_to_write = 0, .range_start = range_start, .range_end = range_end, }; + struct nfs_page *req; + int need_commit; int ret; - do { + while(PagePrivate(page)) { if (clear_page_dirty_for_io(page)) { ret = nfs_writepage_locked(page, &wbc); if (ret < 0) goto out_error; - } else if (!PagePrivate(page)) + } + req = nfs_find_and_lock_request(page); + if (!req) break; - ret = nfs_sync_mapping_wait(page->mapping, &wbc, how); - if (ret < 0) + if (IS_ERR(req)) { + ret = PTR_ERR(req); goto out_error; - } while (PagePrivate(page)); + } + need_commit = test_bit(PG_CLEAN, &req->wb_flags); + nfs_clear_page_tag_locked(req); + if (need_commit) { + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (ret < 0) + goto out_error; + } + } return 0; out_error: - __mark_inode_dirty(inode, I_DIRTY_PAGES); return ret; } -/* - * Write back all requests on one page - we do this before reading it. - */ -int nfs_wb_page(struct inode *inode, struct page* page) -{ - return nfs_wb_page_priority(inode, page, FLUSH_STABLE); -} - #ifdef CONFIG_MIGRATION int nfs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page) diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 93f439e7c5bf..b789d85bff82 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -475,7 +475,6 @@ extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); * Try to write back everything synchronously (but check the * return value!) */ -extern long nfs_sync_mapping_wait(struct address_space *, struct writeback_control *, int); extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_page(struct inode *inode, struct page* page); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); -- cgit v1.2.3 From 5cf95214ccb915591e2214f81de4659302d3e452 Mon Sep 17 00:00:00 2001 From: Trond Myklebust <Trond.Myklebust@netapp.com> Date: Fri, 19 Feb 2010 17:03:29 -0800 Subject: NFS: Clean up nfs_sync_mapping Remove the redundant call to filemap_write_and_wait(). Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/inode.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 443772df9b17..e8b41170d295 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -114,16 +114,12 @@ void nfs_clear_inode(struct inode *inode) */ int nfs_sync_mapping(struct address_space *mapping) { - int ret; + int ret = 0; - if (mapping->nrpages == 0) - return 0; - unmap_mapping_range(mapping, 0, 0, 0); - ret = filemap_write_and_wait(mapping); - if (ret != 0) - goto out; - ret = nfs_wb_all(mapping->host); -out: + if (mapping->nrpages != 0) { + unmap_mapping_range(mapping, 0, 0, 0); + ret = nfs_wb_all(mapping->host); + } return ret; } -- cgit v1.2.3 From 1cda707d52e51a6cafac0aef12d2bd7052d572e6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust <Trond.Myklebust@netapp.com> Date: Fri, 19 Feb 2010 17:03:30 -0800 Subject: NFS: Remove requirement for inode->i_mutex from nfs_invalidate_mapping Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/dir.c | 2 +- fs/nfs/inode.c | 41 +---------------------------------------- fs/nfs/symlink.c | 2 +- include/linux/nfs_fs.h | 1 - 4 files changed, 3 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 3c7f03b669fb..a1f6b4438fb1 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -560,7 +560,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) desc->entry = &my_entry; nfs_block_sillyrename(dentry); - res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping); + res = nfs_revalidate_mapping(inode, filp->f_mapping); if (res < 0) goto out; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e8b41170d295..dbaaf7d2a188 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -754,7 +754,7 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) return __nfs_revalidate_inode(server, inode); } -static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_space *mapping) +static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); @@ -775,49 +775,10 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa return 0; } -static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) -{ - int ret = 0; - - mutex_lock(&inode->i_mutex); - if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_DATA) { - ret = nfs_sync_mapping(mapping); - if (ret == 0) - ret = nfs_invalidate_mapping_nolock(inode, mapping); - } - mutex_unlock(&inode->i_mutex); - return ret; -} - -/** - * nfs_revalidate_mapping_nolock - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping - */ -int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping) -{ - struct nfs_inode *nfsi = NFS_I(inode); - int ret = 0; - - if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) - || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { - ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); - if (ret < 0) - goto out; - } - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) - ret = nfs_invalidate_mapping_nolock(inode, mapping); -out: - return ret; -} - /** * nfs_revalidate_mapping - Revalidate the pagecache * @inode - pointer to host inode * @mapping - pointer to mapping - * - * This version of the function will take the inode->i_mutex and attempt to - * flush out all dirty data if it needs to invalidate the page cache. */ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) { diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 412738dbfbc7..2ea9e5c27e55 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -50,7 +50,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd) struct page *page; void *err; - err = ERR_PTR(nfs_revalidate_mapping_nolock(inode, inode->i_mapping)); + err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); if (err) goto read_failed; page = read_cache_page(&inode->i_data, 0, diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index b789d85bff82..1a0b85aa151e 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -347,7 +347,6 @@ extern int nfs_attribute_timeout(struct inode *inode); extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode); extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); -extern int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping); extern int nfs_setattr(struct dentry *, struct iattr *); extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr); extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); -- cgit v1.2.3 From dd6102fbd917889384d89bc427e98e85e8fda000 Mon Sep 17 00:00:00 2001 From: Sripathi Kodi <sripathik@in.ibm.com> Date: Fri, 5 Mar 2010 18:48:00 +0000 Subject: 9P2010.L handshake: Add VFS flags Add 9P2000.u and 9P2010.L protocol flags to V9FS VFS This patch adds 9P2000.u and 9P2010.L protocol flags into V9FS VFS side code and removes the single flag used for 'extended'. Signed-off-by: Sripathi Kodi <sripathik@in.ibm.com> Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com> --- fs/9p/fid.c | 2 +- fs/9p/v9fs.c | 6 +++--- fs/9p/v9fs.h | 23 +++++++++++++++-------- fs/9p/vfs_file.c | 4 ++-- fs/9p/vfs_inode.c | 29 +++++++++++++++-------------- 5 files changed, 36 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 14d944204571..08b2eb157048 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -151,7 +151,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) if (access == V9FS_ACCESS_SINGLE) return ERR_PTR(-EPERM); - if (v9fs_extended(v9ses)) + if (v9fs_proto_dotu(v9ses)) uname = NULL; else uname = v9ses->uname; diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 7d6c2139891d..247f10a934ed 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -241,7 +241,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, list_add(&v9ses->slist, &v9fs_sessionlist); spin_unlock(&v9fs_sessionlist_lock); - v9ses->flags = V9FS_EXTENDED | V9FS_ACCESS_USER; + v9ses->flags = V9FS_PROTO_2000U | V9FS_ACCESS_USER; strcpy(v9ses->uname, V9FS_DEFUSER); strcpy(v9ses->aname, V9FS_DEFANAME); v9ses->uid = ~0; @@ -263,12 +263,12 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, } if (!v9ses->clnt->dotu) - v9ses->flags &= ~V9FS_EXTENDED; + v9ses->flags &= ~V9FS_PROTO_2000U; v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; /* for legacy mode, fall back to V9FS_ACCESS_ANY */ - if (!v9fs_extended(v9ses) && + if (!v9fs_proto_dotu(v9ses) && ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { v9ses->flags &= ~V9FS_ACCESS_MASK; diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 019f4ccb70c1..79000bf62491 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -23,7 +23,8 @@ /** * enum p9_session_flags - option flags for each 9P session - * @V9FS_EXTENDED: whether or not to use 9P2000.u extensions + * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions + * @V9FS_PROTO_2010L: whether or not to use 9P2010.l extensions * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) * @V9FS_ACCESS_ANY: use a single attach for all users @@ -32,11 +33,12 @@ * Session flags reflect options selected by users at mount time */ enum p9_session_flags { - V9FS_EXTENDED = 0x01, - V9FS_ACCESS_SINGLE = 0x02, - V9FS_ACCESS_USER = 0x04, - V9FS_ACCESS_ANY = 0x06, - V9FS_ACCESS_MASK = 0x06, + V9FS_PROTO_2000U = 0x01, + V9FS_PROTO_2010L = 0x02, + V9FS_ACCESS_SINGLE = 0x04, + V9FS_ACCESS_USER = 0x08, + V9FS_ACCESS_ANY = 0x0C, + V9FS_ACCESS_MASK = 0x0C, }; /* possible values of ->cache */ @@ -121,7 +123,12 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode) return (inode->i_sb->s_fs_info); } -static inline int v9fs_extended(struct v9fs_session_info *v9ses) +static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses) { - return v9ses->flags & V9FS_EXTENDED; + return v9ses->flags & V9FS_PROTO_2000U; +} + +static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses) +{ + return v9ses->flags & V9FS_PROTO_2010L; } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 74a0461a9ac0..36122683fae8 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -61,7 +61,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file); v9ses = v9fs_inode2v9ses(inode); - omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses)); + omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses)); fid = file->private_data; if (!fid) { fid = v9fs_fid_clone(file->f_path.dentry); @@ -77,7 +77,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) i_size_write(inode, 0); inode->i_blocks = 0; } - if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses))) + if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses))) generic_file_llseek(file, 0, SEEK_END); } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index a407fa3388c0..d3d3c3c20001 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -60,7 +60,7 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) res = mode & 0777; if (S_ISDIR(mode)) res |= P9_DMDIR; - if (v9fs_extended(v9ses)) { + if (v9fs_proto_dotu(v9ses)) { if (S_ISLNK(mode)) res |= P9_DMSYMLINK; if (v9ses->nodev == 0) { @@ -102,21 +102,21 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) if ((mode & P9_DMDIR) == P9_DMDIR) res |= S_IFDIR; - else if ((mode & P9_DMSYMLINK) && (v9fs_extended(v9ses))) + else if ((mode & P9_DMSYMLINK) && (v9fs_proto_dotu(v9ses))) res |= S_IFLNK; - else if ((mode & P9_DMSOCKET) && (v9fs_extended(v9ses)) + else if ((mode & P9_DMSOCKET) && (v9fs_proto_dotu(v9ses)) && (v9ses->nodev == 0)) res |= S_IFSOCK; - else if ((mode & P9_DMNAMEDPIPE) && (v9fs_extended(v9ses)) + else if ((mode & P9_DMNAMEDPIPE) && (v9fs_proto_dotu(v9ses)) && (v9ses->nodev == 0)) res |= S_IFIFO; - else if ((mode & P9_DMDEVICE) && (v9fs_extended(v9ses)) + else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses)) && (v9ses->nodev == 0)) res |= S_IFBLK; else res |= S_IFREG; - if (v9fs_extended(v9ses)) { + if (v9fs_proto_dotu(v9ses)) { if ((mode & P9_DMSETUID) == P9_DMSETUID) res |= S_ISUID; @@ -265,7 +265,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) case S_IFBLK: case S_IFCHR: case S_IFSOCK: - if (!v9fs_extended(v9ses)) { + if (!v9fs_proto_dotu(v9ses)) { P9_DPRINTK(P9_DEBUG_ERROR, "special files without extended mode\n"); err = -EINVAL; @@ -278,7 +278,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) inode->i_fop = &v9fs_file_operations; break; case S_IFLNK: - if (!v9fs_extended(v9ses)) { + if (!v9fs_proto_dotu(v9ses)) { P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used w/o 9P2000.u\n"); err = -EINVAL; @@ -288,7 +288,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) break; case S_IFDIR: inc_nlink(inode); - if (v9fs_extended(v9ses)) + if (v9fs_proto_dotu(v9ses)) inode->i_op = &v9fs_dir_inode_operations_ext; else inode->i_op = &v9fs_dir_inode_operations; @@ -575,7 +575,8 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, flags = O_RDWR; fid = v9fs_create(v9ses, dir, dentry, NULL, perm, - v9fs_uflags2omode(flags, v9fs_extended(v9ses))); + v9fs_uflags2omode(flags, + v9fs_proto_dotu(v9ses))); if (IS_ERR(fid)) { err = PTR_ERR(fid); fid = NULL; @@ -858,7 +859,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) if (iattr->ia_valid & ATTR_SIZE) wstat.length = iattr->ia_size; - if (v9fs_extended(v9ses)) { + if (v9fs_proto_dotu(v9ses)) { if (iattr->ia_valid & ATTR_UID) wstat.n_uid = iattr->ia_uid; @@ -897,7 +898,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, inode->i_uid = v9ses->dfltuid; inode->i_gid = v9ses->dfltgid; - if (v9fs_extended(v9ses)) { + if (v9fs_proto_dotu(v9ses)) { inode->i_uid = stat->n_uid; inode->i_gid = stat->n_gid; } @@ -976,7 +977,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) if (IS_ERR(fid)) return PTR_ERR(fid); - if (!v9fs_extended(v9ses)) + if (!v9fs_proto_dotu(v9ses)) return -EBADF; st = p9_client_stat(fid); @@ -1066,7 +1067,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, struct p9_fid *fid; v9ses = v9fs_inode2v9ses(dir); - if (!v9fs_extended(v9ses)) { + if (!v9fs_proto_dotu(v9ses)) { P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n"); return -EPERM; } -- cgit v1.2.3 From 342fee1d5c7dfa05f4e14ec1e583df4553b09776 Mon Sep 17 00:00:00 2001 From: Sripathi Kodi <sripathik@in.ibm.com> Date: Fri, 5 Mar 2010 18:50:14 +0000 Subject: 9P2010.L handshake: Remove "dotu" variable Removes 'dotu' variable and make everything dependent on 'proto_version' field. Signed-off-by: Sripathi Kodi <sripathik@in.ibm.com> Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com> --- fs/9p/v9fs.c | 2 +- fs/9p/vfs_dir.c | 2 +- include/net/9p/client.h | 3 +- net/9p/client.c | 65 ++++++++++++++++++++++++++----------------- net/9p/protocol.c | 74 +++++++++++++++++++++++++++---------------------- net/9p/protocol.h | 6 ++-- 6 files changed, 87 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 247f10a934ed..6c7f6a251115 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -262,7 +262,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, goto error; } - if (!v9ses->clnt->dotu) + if (!p9_is_proto_dotu(v9ses->clnt)) v9ses->flags &= ~V9FS_PROTO_2000U; v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 15cce53bf61e..6580aa449541 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -135,7 +135,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) while (rdir->head < rdir->tail) { err = p9stat_read(rdir->buf + rdir->head, buflen - rdir->head, &st, - fid->clnt->dotu); + fid->clnt->proto_version); if (err) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; diff --git a/include/net/9p/client.h b/include/net/9p/client.h index d40f8c55dfae..52e1fff709e4 100644 --- a/include/net/9p/client.h +++ b/include/net/9p/client.h @@ -151,7 +151,6 @@ struct p9_req_t { struct p9_client { spinlock_t lock; /* protect client structure */ int msize; - unsigned char dotu; unsigned char proto_version; struct p9_trans_module *trans_mod; enum p9_trans_status status; @@ -224,5 +223,7 @@ int p9_parse_header(struct p9_fcall *, int32_t *, int8_t *, int16_t *, int); int p9stat_read(char *, int, struct p9_wstat *, int); void p9stat_free(struct p9_wstat *); +int p9_is_proto_dotu(struct p9_client *clnt); +int p9_is_proto_dotl(struct p9_client *clnt); #endif /* NET_9P_CLIENT_H */ diff --git a/net/9p/client.c b/net/9p/client.c index 3b5f3c94a6eb..9994676e57da 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -58,6 +58,18 @@ static const match_table_t tokens = { {Opt_err, NULL}, }; +inline int p9_is_proto_dotl(struct p9_client *clnt) +{ + return (clnt->proto_version == p9_proto_2010L); +} +EXPORT_SYMBOL(p9_is_proto_dotl); + +inline int p9_is_proto_dotu(struct p9_client *clnt) +{ + return (clnt->proto_version == p9_proto_2000u); +} +EXPORT_SYMBOL(p9_is_proto_dotu); + /* Interpret mount option for protocol version */ static unsigned char get_protocol_version(const substring_t *name) { @@ -97,7 +109,7 @@ static int parse_opts(char *opts, struct p9_client *clnt) int option; int ret = 0; - clnt->dotu = 1; + clnt->proto_version = p9_proto_2000u; clnt->msize = 8192; if (!opts) @@ -140,7 +152,7 @@ static int parse_opts(char *opts, struct p9_client *clnt) } break; case Opt_legacy: - clnt->dotu = 0; + clnt->proto_version = p9_proto_legacy; break; case Opt_version: ret = get_protocol_version(&args[0]); @@ -438,14 +450,15 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) int ecode; char *ename; - err = p9pdu_readf(req->rc, c->dotu, "s?d", &ename, &ecode); + err = p9pdu_readf(req->rc, c->proto_version, "s?d", + &ename, &ecode); if (err) { P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse error%d\n", err); return err; } - if (c->dotu) + if (p9_is_proto_dotu(c)) err = -ecode; if (!err || !IS_ERR_VALUE(err)) @@ -543,7 +556,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) /* marshall the data */ p9pdu_prepare(req->tc, tag, type); va_start(ap, fmt); - err = p9pdu_vwritef(req->tc, c->dotu, fmt, ap); + err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap); va_end(ap); p9pdu_finalize(req->tc); @@ -655,14 +668,14 @@ int p9_client_version(struct p9_client *c) char *version; int msize; - P9_DPRINTK(P9_DEBUG_9P, ">>> TVERSION msize %d extended %d\n", - c->msize, c->dotu); + P9_DPRINTK(P9_DEBUG_9P, ">>> TVERSION msize %d protocol %d\n", + c->msize, c->proto_version); req = p9_client_rpc(c, P9_TVERSION, "ds", c->msize, - c->dotu ? "9P2000.u" : "9P2000"); + p9_is_proto_dotu(c) ? "9P2000.u" : "9P2000"); if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, c->dotu, "ds", &msize, &version); + err = p9pdu_readf(req->rc, c->proto_version, "ds", &msize, &version); if (err) { P9_DPRINTK(P9_DEBUG_9P, "version error %d\n", err); p9pdu_dump(1, req->rc); @@ -670,10 +683,10 @@ int p9_client_version(struct p9_client *c) } P9_DPRINTK(P9_DEBUG_9P, "<<< RVERSION msize %d %s\n", msize, version); - if (!memcmp(version, "9P2000.u", 8)) - c->dotu = 1; - else if (!memcmp(version, "9P2000", 6)) - c->dotu = 0; + if (!strncmp(version, "9P2000.u", 8)) + c->proto_version = p9_proto_2000u; + else if (!strncmp(version, "9P2000", 6)) + c->proto_version = p9_proto_legacy; else { err = -EREMOTEIO; goto error; @@ -728,8 +741,8 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) goto put_trans; } - P9_DPRINTK(P9_DEBUG_MUX, "clnt %p trans %p msize %d dotu %d\n", - clnt, clnt->trans_mod, clnt->msize, clnt->dotu); + P9_DPRINTK(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n", + clnt, clnt->trans_mod, clnt->msize, clnt->proto_version); err = clnt->trans_mod->create(clnt, dev_name, options); if (err) @@ -812,7 +825,7 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, goto error; } - err = p9pdu_readf(req->rc, clnt->dotu, "Q", &qid); + err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid); if (err) { p9pdu_dump(1, req->rc); p9_free_req(clnt, req); @@ -861,7 +874,7 @@ p9_client_auth(struct p9_client *clnt, char *uname, u32 n_uname, char *aname) goto error; } - err = p9pdu_readf(req->rc, clnt->dotu, "Q", &qid); + err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid); if (err) { p9pdu_dump(1, req->rc); p9_free_req(clnt, req); @@ -919,7 +932,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames, goto error; } - err = p9pdu_readf(req->rc, clnt->dotu, "R", &nwqids, &wqids); + err = p9pdu_readf(req->rc, clnt->proto_version, "R", &nwqids, &wqids); if (err) { p9pdu_dump(1, req->rc); p9_free_req(clnt, req); @@ -980,7 +993,7 @@ int p9_client_open(struct p9_fid *fid, int mode) goto error; } - err = p9pdu_readf(req->rc, clnt->dotu, "Qd", &qid, &iounit); + err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit); if (err) { p9pdu_dump(1, req->rc); goto free_and_error; @@ -1025,7 +1038,7 @@ int p9_client_fcreate(struct p9_fid *fid, char *name, u32 perm, int mode, goto error; } - err = p9pdu_readf(req->rc, clnt->dotu, "Qd", &qid, &iounit); + err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit); if (err) { p9pdu_dump(1, req->rc); goto free_and_error; @@ -1126,7 +1139,7 @@ p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset, goto error; } - err = p9pdu_readf(req->rc, clnt->dotu, "D", &count, &dataptr); + err = p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr); if (err) { p9pdu_dump(1, req->rc); goto free_and_error; @@ -1187,7 +1200,7 @@ p9_client_write(struct p9_fid *fid, char *data, const char __user *udata, goto error; } - err = p9pdu_readf(req->rc, clnt->dotu, "d", &count); + err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count); if (err) { p9pdu_dump(1, req->rc); goto free_and_error; @@ -1227,7 +1240,7 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid) goto error; } - err = p9pdu_readf(req->rc, clnt->dotu, "wS", &ignored, ret); + err = p9pdu_readf(req->rc, clnt->proto_version, "wS", &ignored, ret); if (err) { p9pdu_dump(1, req->rc); p9_free_req(clnt, req); @@ -1254,7 +1267,7 @@ error: } EXPORT_SYMBOL(p9_client_stat); -static int p9_client_statsize(struct p9_wstat *wst, int optional) +static int p9_client_statsize(struct p9_wstat *wst, int proto_version) { int ret; @@ -1273,7 +1286,7 @@ static int p9_client_statsize(struct p9_wstat *wst, int optional) if (wst->muid) ret += strlen(wst->muid); - if (optional) { + if (proto_version == p9_proto_2000u) { ret += 2+4+4+4; /* extension[s] n_uid[4] n_gid[4] n_muid[4] */ if (wst->extension) ret += strlen(wst->extension); @@ -1290,7 +1303,7 @@ int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst) err = 0; clnt = fid->clnt; - wst->size = p9_client_statsize(wst, clnt->dotu); + wst->size = p9_client_statsize(wst, clnt->proto_version); P9_DPRINTK(P9_DEBUG_9P, ">>> TWSTAT fid %d\n", fid->fid); P9_DPRINTK(P9_DEBUG_9P, " sz=%x type=%x dev=%x qid=%x.%llx.%x\n" diff --git a/net/9p/protocol.c b/net/9p/protocol.c index fc70147c771e..94f5a8f65e9c 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -52,7 +52,7 @@ #endif static int -p9pdu_writef(struct p9_fcall *pdu, int optional, const char *fmt, ...); +p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); #ifdef CONFIG_NET_9P_DEBUG void @@ -144,7 +144,8 @@ pdu_write_u(struct p9_fcall *pdu, const char __user *udata, size_t size) */ static int -p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) +p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, + va_list ap) { const char *ptr; int errcode = 0; @@ -194,7 +195,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) int16_t len; int size; - errcode = p9pdu_readf(pdu, optional, "w", &len); + errcode = p9pdu_readf(pdu, proto_version, + "w", &len); if (errcode) break; @@ -217,7 +219,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) struct p9_qid *qid = va_arg(ap, struct p9_qid *); - errcode = p9pdu_readf(pdu, optional, "bdq", + errcode = p9pdu_readf(pdu, proto_version, "bdq", &qid->type, &qid->version, &qid->path); } @@ -230,7 +232,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) stbuf->n_uid = stbuf->n_gid = stbuf->n_muid = -1; errcode = - p9pdu_readf(pdu, optional, + p9pdu_readf(pdu, proto_version, "wwdQdddqssss?sddd", &stbuf->size, &stbuf->type, &stbuf->dev, &stbuf->qid, @@ -250,7 +252,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) void **data = va_arg(ap, void **); errcode = - p9pdu_readf(pdu, optional, "d", count); + p9pdu_readf(pdu, proto_version, "d", count); if (!errcode) { *count = MIN(*count, @@ -263,8 +265,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) int16_t *nwname = va_arg(ap, int16_t *); char ***wnames = va_arg(ap, char ***); - errcode = - p9pdu_readf(pdu, optional, "w", nwname); + errcode = p9pdu_readf(pdu, proto_version, + "w", nwname); if (!errcode) { *wnames = kmalloc(sizeof(char *) * *nwname, @@ -278,7 +280,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) for (i = 0; i < *nwname; i++) { errcode = - p9pdu_readf(pdu, optional, + p9pdu_readf(pdu, + proto_version, "s", &(*wnames)[i]); if (errcode) @@ -306,7 +309,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) *wqids = NULL; errcode = - p9pdu_readf(pdu, optional, "w", nwqid); + p9pdu_readf(pdu, proto_version, "w", nwqid); if (!errcode) { *wqids = kmalloc(*nwqid * @@ -321,7 +324,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) for (i = 0; i < *nwqid; i++) { errcode = - p9pdu_readf(pdu, optional, + p9pdu_readf(pdu, + proto_version, "Q", &(*wqids)[i]); if (errcode) @@ -336,7 +340,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) } break; case '?': - if (!optional) + if (proto_version != p9_proto_2000u) return 0; break; default: @@ -352,7 +356,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) } int -p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) +p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, + va_list ap) { const char *ptr; int errcode = 0; @@ -389,7 +394,8 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) if (sptr) len = MIN(strlen(sptr), USHORT_MAX); - errcode = p9pdu_writef(pdu, optional, "w", len); + errcode = p9pdu_writef(pdu, proto_version, + "w", len); if (!errcode && pdu_write(pdu, sptr, len)) errcode = -EFAULT; } @@ -398,7 +404,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) const struct p9_qid *qid = va_arg(ap, const struct p9_qid *); errcode = - p9pdu_writef(pdu, optional, "bdq", + p9pdu_writef(pdu, proto_version, "bdq", qid->type, qid->version, qid->path); } break; @@ -406,7 +412,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) const struct p9_wstat *stbuf = va_arg(ap, const struct p9_wstat *); errcode = - p9pdu_writef(pdu, optional, + p9pdu_writef(pdu, proto_version, "wwdQdddqssss?sddd", stbuf->size, stbuf->type, stbuf->dev, &stbuf->qid, @@ -421,8 +427,8 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) int32_t count = va_arg(ap, int32_t); const void *data = va_arg(ap, const void *); - errcode = - p9pdu_writef(pdu, optional, "d", count); + errcode = p9pdu_writef(pdu, proto_version, "d", + count); if (!errcode && pdu_write(pdu, data, count)) errcode = -EFAULT; } @@ -431,8 +437,8 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) int32_t count = va_arg(ap, int32_t); const char __user *udata = va_arg(ap, const void __user *); - errcode = - p9pdu_writef(pdu, optional, "d", count); + errcode = p9pdu_writef(pdu, proto_version, "d", + count); if (!errcode && pdu_write_u(pdu, udata, count)) errcode = -EFAULT; } @@ -441,14 +447,15 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) int16_t nwname = va_arg(ap, int); const char **wnames = va_arg(ap, const char **); - errcode = - p9pdu_writef(pdu, optional, "w", nwname); + errcode = p9pdu_writef(pdu, proto_version, "w", + nwname); if (!errcode) { int i; for (i = 0; i < nwname; i++) { errcode = - p9pdu_writef(pdu, optional, + p9pdu_writef(pdu, + proto_version, "s", wnames[i]); if (errcode) @@ -462,14 +469,15 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) struct p9_qid *wqids = va_arg(ap, struct p9_qid *); - errcode = - p9pdu_writef(pdu, optional, "w", nwqid); + errcode = p9pdu_writef(pdu, proto_version, "w", + nwqid); if (!errcode) { int i; for (i = 0; i < nwqid; i++) { errcode = - p9pdu_writef(pdu, optional, + p9pdu_writef(pdu, + proto_version, "Q", &wqids[i]); if (errcode) @@ -479,7 +487,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) } break; case '?': - if (!optional) + if (proto_version != p9_proto_2000u) return 0; break; default: @@ -494,32 +502,32 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) return errcode; } -int p9pdu_readf(struct p9_fcall *pdu, int optional, const char *fmt, ...) +int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...) { va_list ap; int ret; va_start(ap, fmt); - ret = p9pdu_vreadf(pdu, optional, fmt, ap); + ret = p9pdu_vreadf(pdu, proto_version, fmt, ap); va_end(ap); return ret; } static int -p9pdu_writef(struct p9_fcall *pdu, int optional, const char *fmt, ...) +p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...) { va_list ap; int ret; va_start(ap, fmt); - ret = p9pdu_vwritef(pdu, optional, fmt, ap); + ret = p9pdu_vwritef(pdu, proto_version, fmt, ap); va_end(ap); return ret; } -int p9stat_read(char *buf, int len, struct p9_wstat *st, int dotu) +int p9stat_read(char *buf, int len, struct p9_wstat *st, int proto_version) { struct p9_fcall fake_pdu; int ret; @@ -529,7 +537,7 @@ int p9stat_read(char *buf, int len, struct p9_wstat *st, int dotu) fake_pdu.sdata = buf; fake_pdu.offset = 0; - ret = p9pdu_readf(&fake_pdu, dotu, "S", st); + ret = p9pdu_readf(&fake_pdu, proto_version, "S", st); if (ret) { P9_DPRINTK(P9_DEBUG_9P, "<<< p9stat_read failed: %d\n", ret); p9pdu_dump(1, &fake_pdu); diff --git a/net/9p/protocol.h b/net/9p/protocol.h index ccde462e7ac5..2431c0f38d56 100644 --- a/net/9p/protocol.h +++ b/net/9p/protocol.h @@ -25,9 +25,9 @@ * */ -int -p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap); -int p9pdu_readf(struct p9_fcall *pdu, int optional, const char *fmt, ...); +int p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, + va_list ap); +int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type); int p9pdu_finalize(struct p9_fcall *pdu); void p9pdu_dump(int, struct p9_fcall *); -- cgit v1.2.3 From 5717144a01d701614cfdb15f09ed562d720cf3db Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> Date: Fri, 5 Mar 2010 14:43:43 -0600 Subject: fs/9p: Add hardlink support to .u extension For regular file and directories we put the link count in th extension field in a tagged string format. Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com> --- fs/9p/vfs_inode.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index d3d3c3c20001..5fe45d692c9f 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -887,6 +887,8 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct super_block *sb) { char ext[32]; + char tag_name[14]; + unsigned int i_nlink; struct v9fs_session_info *v9ses = sb->s_fs_info; inode->i_nlink = 1; @@ -902,7 +904,22 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, inode->i_uid = stat->n_uid; inode->i_gid = stat->n_gid; } - + if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) { + if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) { + /* + * Hadlink support got added later to + * to the .u extension. So there can be + * server out there that doesn't support + * this even with .u extension. So check + * for non NULL stat->extension + */ + strncpy(ext, stat->extension, sizeof(ext)); + /* HARDLINKCOUNT %u */ + sscanf(ext, "%13s %u", tag_name, &i_nlink); + if (!strncmp(tag_name, "HARDLINKCOUNT", 13)) + inode->i_nlink = i_nlink; + } + } inode->i_mode = p9mode2unixmode(v9ses, stat->mode); if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) { char type = 0; -- cgit v1.2.3 From 9cf05b416d3324457f1dd8be35f4eaa7a9640bed Mon Sep 17 00:00:00 2001 From: Joern Engel <joern@logfs.org> Date: Sat, 6 Mar 2010 10:01:46 +0100 Subject: [LogFS] Remove h_version field Incompatible change: h_compr is moved up so the padding is all in one chunk. --- fs/logfs/journal.c | 7 ++++--- fs/logfs/logfs_abi.h | 4 +--- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c index 57eb4fb444a9..6ad30a4c9052 100644 --- a/fs/logfs/journal.c +++ b/fs/logfs/journal.c @@ -419,12 +419,13 @@ static size_t __logfs_write_header(struct logfs_super *super, { jh->h_len = cpu_to_be16(len); jh->h_type = cpu_to_be16(type); - jh->h_version = cpu_to_be16(++super->s_last_version); jh->h_datalen = cpu_to_be16(datalen); jh->h_compr = compr; jh->h_pad[0] = 'H'; - jh->h_pad[1] = 'A'; - jh->h_pad[2] = 'T'; + jh->h_pad[1] = 'E'; + jh->h_pad[2] = 'A'; + jh->h_pad[3] = 'D'; + jh->h_pad[4] = 'R'; jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4); return ALIGN(len, 16) + sizeof(*jh); } diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h index 8d4dd3de551e..d8cc95734a60 100644 --- a/fs/logfs/logfs_abi.h +++ b/fs/logfs/logfs_abi.h @@ -422,7 +422,6 @@ SIZE_CHECK(logfs_segment_entry, 8); * not including header * @h_datalen: length of uncompressed data * @h_type: JE type - * @h_version: unnormalized version of journal entry * @h_compr: compression type * @h_pad: reserved */ @@ -431,9 +430,8 @@ struct logfs_journal_header { __be16 h_len; __be16 h_datalen; __be16 h_type; - __be16 h_version; __u8 h_compr; - __u8 h_pad[3]; + __u8 h_pad[5]; }; SIZE_CHECK(logfs_journal_header, 16); -- cgit v1.2.3 From c2f843f03d658e9ab2a1a455f2c1851fd6a869af Mon Sep 17 00:00:00 2001 From: Joern Engel <joern@logfs.org> Date: Sat, 6 Mar 2010 10:03:11 +0100 Subject: [LogFS] Change magic number Many changes were made during development that could result in old versions of mklogfs and the kernel code being subtly incompatible. Not being a friend of subtleties, I hereby change the magic number. Any old version of mklogfs is now guaranteed to fail. --- fs/logfs/logfs_abi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h index d8cc95734a60..f674725663fe 100644 --- a/fs/logfs/logfs_abi.h +++ b/fs/logfs/logfs_abi.h @@ -61,7 +61,7 @@ static inline void check_##type(void) \ /* Magic numbers. 64bit for superblock, 32bit for statfs f_type */ -#define LOGFS_MAGIC 0xb21f205ac97e8168ull +#define LOGFS_MAGIC 0x7a3a8e5cb9d5bf67ull #define LOGFS_MAGIC_U32 0xc97e8168u /* -- cgit v1.2.3 From 781b16775ba0bb55fac0e1757bf0bd87c8879632 Mon Sep 17 00:00:00 2001 From: Al Viro <viro@ZenIV.linux.org.uk> Date: Sat, 6 Mar 2010 18:41:07 +0000 Subject: Fix a dumb typo - use of & instead of && We managed to lose O_DIRECTORY testing due to a stupid typo in commit 1f36f774b2 ("Switch !O_CREAT case to use of do_last()") Reported-by: Walter Sheets <w41ter@gmail.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 3d9d2f965f84..48e60a187325 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1656,7 +1656,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (path->dentry->d_inode->i_op->follow_link) return NULL; error = -ENOTDIR; - if (*want_dir & !path->dentry->d_inode->i_op->lookup) + if (*want_dir && !path->dentry->d_inode->i_op->lookup) goto exit_dput; path_to_nameidata(path, nd); audit_inode(pathname, nd->path.dentry); -- cgit v1.2.3 From 984b3f5746ed2cde3d184651dabf26980f2b66e5 Mon Sep 17 00:00:00 2001 From: Akinobu Mita <akinobu.mita@gmail.com> Date: Fri, 5 Mar 2010 13:41:37 -0800 Subject: bitops: rename for_each_bit() to for_each_set_bit() Rename for_each_bit to for_each_set_bit in the kernel source tree. To permit for_each_clear_bit(), should that ever be added. The patch includes a macro to map the old for_each_bit() onto the new for_each_set_bit(). This is a (very) temporary thing to ease the migration. [akpm@linux-foundation.org: add temporary for_each_bit()] Suggested-by: Alexey Dobriyan <adobriyan@gmail.com> Suggested-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: Russell King <rmk@arm.linux.org.uk> Cc: David Woodhouse <dwmw2@infradead.org> Cc: Artem Bityutskiy <dedekind@infradead.org> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- drivers/dma/ioat/dma.c | 2 +- drivers/gpio/pl061.c | 2 +- drivers/gpio/timbgpio.c | 2 +- drivers/i2c/busses/i2c-designware.c | 4 ++-- drivers/mfd/htc-egpio.c | 2 +- drivers/misc/sgi-xp/xpnet.c | 2 +- drivers/net/gianfar.c | 12 ++++++------ drivers/net/ixgbe/ixgbe_main.c | 2 +- drivers/net/ixgbevf/ixgbevf_main.c | 2 +- drivers/net/wireless/ath/ar9170/main.c | 2 +- drivers/net/wireless/iwmc3200wifi/debugfs.c | 2 +- drivers/net/wireless/iwmc3200wifi/rx.c | 2 +- fs/ocfs2/quota_local.c | 2 +- include/linux/bitops.h | 4 +++- kernel/sched_cpupri.c | 2 +- sound/soc/codecs/uda1380.c | 2 +- 18 files changed, 26 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 641ccb9dddbc..b1fbdeecf6c9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -676,7 +676,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (c->weight != w) continue; - for_each_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { + for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { if (!test_bit(j, used_mask)) break; } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index cf6590cf4a5f..977e7544738c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -757,7 +757,7 @@ again: inc_irq_stat(apic_perf_irqs); ack = status; - for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[bit]; clear_bit(bit, (unsigned long *) &status); diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c index 5d0e42b263df..af14c9a5b8d4 100644 --- a/drivers/dma/ioat/dma.c +++ b/drivers/dma/ioat/dma.c @@ -71,7 +71,7 @@ static irqreturn_t ioat_dma_do_interrupt(int irq, void *data) } attnstatus = readl(instance->reg_base + IOAT_ATTNSTATUS_OFFSET); - for_each_bit(bit, &attnstatus, BITS_PER_LONG) { + for_each_set_bit(bit, &attnstatus, BITS_PER_LONG) { chan = ioat_chan_by_index(instance, bit); tasklet_schedule(&chan->cleanup_task); } diff --git a/drivers/gpio/pl061.c b/drivers/gpio/pl061.c index 4ee4c8367a3f..3ad1eeb49609 100644 --- a/drivers/gpio/pl061.c +++ b/drivers/gpio/pl061.c @@ -219,7 +219,7 @@ static void pl061_irq_handler(unsigned irq, struct irq_desc *desc) if (pending == 0) continue; - for_each_bit(offset, &pending, PL061_GPIO_NR) + for_each_set_bit(offset, &pending, PL061_GPIO_NR) generic_handle_irq(pl061_to_irq(&chip->gc, offset)); } desc->chip->unmask(irq); diff --git a/drivers/gpio/timbgpio.c b/drivers/gpio/timbgpio.c index d941f45fe557..4ecba6e5a32d 100644 --- a/drivers/gpio/timbgpio.c +++ b/drivers/gpio/timbgpio.c @@ -175,7 +175,7 @@ static void timbgpio_irq(unsigned int irq, struct irq_desc *desc) ipr = ioread32(tgpio->membase + TGPIO_IPR); iowrite32(ipr, tgpio->membase + TGPIO_ICR); - for_each_bit(offset, &ipr, tgpio->gpio.ngpio) + for_each_set_bit(offset, &ipr, tgpio->gpio.ngpio) generic_handle_irq(timbgpio_to_irq(&tgpio->gpio, offset)); } diff --git a/drivers/i2c/busses/i2c-designware.c b/drivers/i2c/busses/i2c-designware.c index 9e18ef97f156..3e72b69aa7f8 100644 --- a/drivers/i2c/busses/i2c-designware.c +++ b/drivers/i2c/busses/i2c-designware.c @@ -497,13 +497,13 @@ static int i2c_dw_handle_tx_abort(struct dw_i2c_dev *dev) int i; if (abort_source & DW_IC_TX_ABRT_NOACK) { - for_each_bit(i, &abort_source, ARRAY_SIZE(abort_sources)) + for_each_set_bit(i, &abort_source, ARRAY_SIZE(abort_sources)) dev_dbg(dev->dev, "%s: %s\n", __func__, abort_sources[i]); return -EREMOTEIO; } - for_each_bit(i, &abort_source, ARRAY_SIZE(abort_sources)) + for_each_set_bit(i, &abort_source, ARRAY_SIZE(abort_sources)) dev_err(dev->dev, "%s: %s\n", __func__, abort_sources[i]); if (abort_source & DW_IC_TX_ARB_LOST) diff --git a/drivers/mfd/htc-egpio.c b/drivers/mfd/htc-egpio.c index aa266e1f69b2..addb846c1e34 100644 --- a/drivers/mfd/htc-egpio.c +++ b/drivers/mfd/htc-egpio.c @@ -108,7 +108,7 @@ static void egpio_handler(unsigned int irq, struct irq_desc *desc) ack_irqs(ei); /* Process all set pins. */ readval &= ei->irqs_enabled; - for_each_bit(irqpin, &readval, ei->nirqs) { + for_each_set_bit(irqpin, &readval, ei->nirqs) { /* Run irq handler */ pr_debug("got IRQ %d\n", irqpin); irq = ei->irq_start + irqpin; diff --git a/drivers/misc/sgi-xp/xpnet.c b/drivers/misc/sgi-xp/xpnet.c index 16f0abda1423..57b152f8d1b9 100644 --- a/drivers/misc/sgi-xp/xpnet.c +++ b/drivers/misc/sgi-xp/xpnet.c @@ -475,7 +475,7 @@ xpnet_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) if (skb->data[0] == 0xff) { /* we are being asked to broadcast to all partitions */ - for_each_bit(dest_partid, xpnet_broadcast_partitions, + for_each_set_bit(dest_partid, xpnet_broadcast_partitions, xp_max_npartitions) { xpnet_send(skb, queued_msg, start_addr, end_addr, diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c index 6aa526ee9096..61a7b4351e78 100644 --- a/drivers/net/gianfar.c +++ b/drivers/net/gianfar.c @@ -998,7 +998,7 @@ static int gfar_probe(struct of_device *ofdev, } /* Need to reverse the bit maps as bit_map's MSB is q0 - * but, for_each_bit parses from right to left, which + * but, for_each_set_bit parses from right to left, which * basically reverses the queue numbers */ for (i = 0; i< priv->num_grps; i++) { priv->gfargrp[i].tx_bit_map = reverse_bitmap( @@ -1011,7 +1011,7 @@ static int gfar_probe(struct of_device *ofdev, * also assign queues to groups */ for (grp_idx = 0; grp_idx < priv->num_grps; grp_idx++) { priv->gfargrp[grp_idx].num_rx_queues = 0x0; - for_each_bit(i, &priv->gfargrp[grp_idx].rx_bit_map, + for_each_set_bit(i, &priv->gfargrp[grp_idx].rx_bit_map, priv->num_rx_queues) { priv->gfargrp[grp_idx].num_rx_queues++; priv->rx_queue[i]->grp = &priv->gfargrp[grp_idx]; @@ -1019,7 +1019,7 @@ static int gfar_probe(struct of_device *ofdev, rqueue = rqueue | ((RQUEUE_EN0 | RQUEUE_EX0) >> i); } priv->gfargrp[grp_idx].num_tx_queues = 0x0; - for_each_bit (i, &priv->gfargrp[grp_idx].tx_bit_map, + for_each_set_bit(i, &priv->gfargrp[grp_idx].tx_bit_map, priv->num_tx_queues) { priv->gfargrp[grp_idx].num_tx_queues++; priv->tx_queue[i]->grp = &priv->gfargrp[grp_idx]; @@ -1709,7 +1709,7 @@ void gfar_configure_coalescing(struct gfar_private *priv, if (priv->mode == MQ_MG_MODE) { baddr = ®s->txic0; - for_each_bit (i, &tx_mask, priv->num_tx_queues) { + for_each_set_bit(i, &tx_mask, priv->num_tx_queues) { if (likely(priv->tx_queue[i]->txcoalescing)) { gfar_write(baddr + i, 0); gfar_write(baddr + i, priv->tx_queue[i]->txic); @@ -1717,7 +1717,7 @@ void gfar_configure_coalescing(struct gfar_private *priv, } baddr = ®s->rxic0; - for_each_bit (i, &rx_mask, priv->num_rx_queues) { + for_each_set_bit(i, &rx_mask, priv->num_rx_queues) { if (likely(priv->rx_queue[i]->rxcoalescing)) { gfar_write(baddr + i, 0); gfar_write(baddr + i, priv->rx_queue[i]->rxic); @@ -2607,7 +2607,7 @@ static int gfar_poll(struct napi_struct *napi, int budget) budget_per_queue = left_over_budget/num_queues; left_over_budget = 0; - for_each_bit(i, &gfargrp->rx_bit_map, priv->num_rx_queues) { + for_each_set_bit(i, &gfargrp->rx_bit_map, priv->num_rx_queues) { if (test_bit(i, &serviced_queues)) continue; rx_queue = priv->rx_queue[i]; diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c index 45e3532b166f..684af371462d 100644 --- a/drivers/net/ixgbe/ixgbe_main.c +++ b/drivers/net/ixgbe/ixgbe_main.c @@ -1050,7 +1050,7 @@ static void ixgbe_configure_msix(struct ixgbe_adapter *adapter) */ for (v_idx = 0; v_idx < q_vectors; v_idx++) { q_vector = adapter->q_vector[v_idx]; - /* XXX for_each_bit(...) */ + /* XXX for_each_set_bit(...) */ r_idx = find_first_bit(q_vector->rxr_idx, adapter->num_rx_queues); diff --git a/drivers/net/ixgbevf/ixgbevf_main.c b/drivers/net/ixgbevf/ixgbevf_main.c index 235b5fd4b8d4..ca653c49b765 100644 --- a/drivers/net/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ixgbevf/ixgbevf_main.c @@ -751,7 +751,7 @@ static void ixgbevf_configure_msix(struct ixgbevf_adapter *adapter) */ for (v_idx = 0; v_idx < q_vectors; v_idx++) { q_vector = adapter->q_vector[v_idx]; - /* XXX for_each_bit(...) */ + /* XXX for_each_set_bit(...) */ r_idx = find_first_bit(q_vector->rxr_idx, adapter->num_rx_queues); diff --git a/drivers/net/wireless/ath/ar9170/main.c b/drivers/net/wireless/ath/ar9170/main.c index 8a964f130367..a6452af9c6c5 100644 --- a/drivers/net/wireless/ath/ar9170/main.c +++ b/drivers/net/wireless/ath/ar9170/main.c @@ -394,7 +394,7 @@ static void ar9170_tx_fake_ampdu_status(struct ar9170 *ar) ieee80211_tx_status_irqsafe(ar->hw, skb); } - for_each_bit(i, &queue_bitmap, BITS_PER_BYTE) { + for_each_set_bit(i, &queue_bitmap, BITS_PER_BYTE) { #ifdef AR9170_QUEUE_STOP_DEBUG printk(KERN_DEBUG "%s: wake queue %d\n", wiphy_name(ar->hw->wiphy), i); diff --git a/drivers/net/wireless/iwmc3200wifi/debugfs.c b/drivers/net/wireless/iwmc3200wifi/debugfs.c index be992ca41cf1..c29c994de0e2 100644 --- a/drivers/net/wireless/iwmc3200wifi/debugfs.c +++ b/drivers/net/wireless/iwmc3200wifi/debugfs.c @@ -89,7 +89,7 @@ static int iwm_debugfs_dbg_modules_write(void *data, u64 val) for (i = 0; i < __IWM_DM_NR; i++) iwm->dbg.dbg_module[i] = 0; - for_each_bit(bit, &iwm->dbg.dbg_modules, __IWM_DM_NR) + for_each_set_bit(bit, &iwm->dbg.dbg_modules, __IWM_DM_NR) iwm->dbg.dbg_module[bit] = iwm->dbg.dbg_level; return 0; diff --git a/drivers/net/wireless/iwmc3200wifi/rx.c b/drivers/net/wireless/iwmc3200wifi/rx.c index ad8f7eabb5aa..8456b4dbd146 100644 --- a/drivers/net/wireless/iwmc3200wifi/rx.c +++ b/drivers/net/wireless/iwmc3200wifi/rx.c @@ -1116,7 +1116,7 @@ static int iwm_ntf_stop_resume_tx(struct iwm_priv *iwm, u8 *buf, return -EINVAL; } - for_each_bit(bit, (unsigned long *)&tid_msk, IWM_UMAC_TID_NR) { + for_each_set_bit(bit, (unsigned long *)&tid_msk, IWM_UMAC_TID_NR) { tid_info = &sta_info->tid_info[bit]; mutex_lock(&tid_info->mutex); diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 21f9e71223ca..a6467f3d262e 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -457,7 +457,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, break; } dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; - for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) { + for_each_set_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) { qbh = NULL; status = ocfs2_read_quota_block(lqinode, ol_dqblk_block(sb, chunk, bit), diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 25b8b2f33ae9..b79389879238 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -16,11 +16,13 @@ */ #include <asm/bitops.h> -#define for_each_bit(bit, addr, size) \ +#define for_each_set_bit(bit, addr, size) \ for ((bit) = find_first_bit((addr), (size)); \ (bit) < (size); \ (bit) = find_next_bit((addr), (size), (bit) + 1)) +/* Temporary */ +#define for_each_bit(bit, addr, size) for_each_set_bit(bit, addr, size) static __inline__ int get_bitmask_order(unsigned int count) { diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index eeb3506c4834..82095bf2099f 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -47,7 +47,7 @@ static int convert_prio(int prio) } #define for_each_cpupri_active(array, idx) \ - for_each_bit(idx, array, CPUPRI_NR_PRIORITIES) + for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES) /** * cpupri_find - find the best (lowest-pri) CPU in the system diff --git a/sound/soc/codecs/uda1380.c b/sound/soc/codecs/uda1380.c index a2763c2e7348..9cd0a66b7663 100644 --- a/sound/soc/codecs/uda1380.c +++ b/sound/soc/codecs/uda1380.c @@ -137,7 +137,7 @@ static void uda1380_flush_work(struct work_struct *work) { int bit, reg; - for_each_bit(bit, &uda1380_cache_dirty, UDA1380_CACHEREGNUM - 0x10) { + for_each_set_bit(bit, &uda1380_cache_dirty, UDA1380_CACHEREGNUM - 0x10) { reg = 0x10 + bit; pr_debug("uda1380: flush reg %x val %x:\n", reg, uda1380_read_reg_cache(uda1380_codec, reg)); -- cgit v1.2.3 From d559db086ff5be9bcc259e5aa50bf3d881eaf1d1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Date: Fri, 5 Mar 2010 13:41:39 -0800 Subject: mm: clean up mm_counter Presently, per-mm statistics counter is defined by macro in sched.h This patch modifies it to - defined in mm.h as inlinf functions - use array instead of macro's name creation. This patch is for reducing patch size in future patch to modify implementation of per-mm counter. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Reviewed-by: Minchan Kim <minchan.kim@gmail.com> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/proc/task_mmu.c | 4 +- include/linux/mm.h | 104 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 33 ++++++++++----- include/linux/sched.h | 54 ------------------------ kernel/fork.c | 3 +- kernel/tsacct.c | 1 + mm/filemap_xip.c | 2 +- mm/fremap.c | 2 +- mm/memory.c | 56 +++++++++++++++---------- mm/oom_kill.c | 4 +- mm/rmap.c | 10 ++--- mm/swapfile.c | 2 +- 12 files changed, 174 insertions(+), 101 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f277c4a111cb..375581276011 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -65,11 +65,11 @@ unsigned long task_vsize(struct mm_struct *mm) int task_statm(struct mm_struct *mm, int *shared, int *text, int *data, int *resident) { - *shared = get_mm_counter(mm, file_rss); + *shared = get_mm_counter(mm, MM_FILEPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->total_vm - mm->shared_vm; - *resident = *shared + get_mm_counter(mm, anon_rss); + *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); return mm->total_vm; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 90957f14195c..2124cdb2d1d0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -870,6 +870,110 @@ extern int mprotect_fixup(struct vm_area_struct *vma, */ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); +/* + * per-process(per-mm_struct) statistics. + */ +#if USE_SPLIT_PTLOCKS +/* + * The mm counters are not protected by its page_table_lock, + * so must be incremented atomically. + */ +static inline void set_mm_counter(struct mm_struct *mm, int member, long value) +{ + atomic_long_set(&mm->rss_stat.count[member], value); +} + +static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) +{ + return (unsigned long)atomic_long_read(&mm->rss_stat.count[member]); +} + +static inline void add_mm_counter(struct mm_struct *mm, int member, long value) +{ + atomic_long_add(value, &mm->rss_stat.count[member]); +} + +static inline void inc_mm_counter(struct mm_struct *mm, int member) +{ + atomic_long_inc(&mm->rss_stat.count[member]); +} + +static inline void dec_mm_counter(struct mm_struct *mm, int member) +{ + atomic_long_dec(&mm->rss_stat.count[member]); +} + +#else /* !USE_SPLIT_PTLOCKS */ +/* + * The mm counters are protected by its page_table_lock, + * so can be incremented directly. + */ +static inline void set_mm_counter(struct mm_struct *mm, int member, long value) +{ + mm->rss_stat.count[member] = value; +} + +static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) +{ + return mm->rss_stat.count[member]; +} + +static inline void add_mm_counter(struct mm_struct *mm, int member, long value) +{ + mm->rss_stat.count[member] += value; +} + +static inline void inc_mm_counter(struct mm_struct *mm, int member) +{ + mm->rss_stat.count[member]++; +} + +static inline void dec_mm_counter(struct mm_struct *mm, int member) +{ + mm->rss_stat.count[member]--; +} + +#endif /* !USE_SPLIT_PTLOCKS */ + +static inline unsigned long get_mm_rss(struct mm_struct *mm) +{ + return get_mm_counter(mm, MM_FILEPAGES) + + get_mm_counter(mm, MM_ANONPAGES); +} + +static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) +{ + return max(mm->hiwater_rss, get_mm_rss(mm)); +} + +static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) +{ + return max(mm->hiwater_vm, mm->total_vm); +} + +static inline void update_hiwater_rss(struct mm_struct *mm) +{ + unsigned long _rss = get_mm_rss(mm); + + if ((mm)->hiwater_rss < _rss) + (mm)->hiwater_rss = _rss; +} + +static inline void update_hiwater_vm(struct mm_struct *mm) +{ + if (mm->hiwater_vm < mm->total_vm) + mm->hiwater_vm = mm->total_vm; +} + +static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, + struct mm_struct *mm) +{ + unsigned long hiwater_rss = get_mm_hiwater_rss(mm); + + if (*maxrss < hiwater_rss) + *maxrss = hiwater_rss; +} + /* * A callback you can register to apply pressure to ageable caches. diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 36f96271306c..e1ca64be6678 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -24,12 +24,6 @@ struct address_space; #define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) -#if USE_SPLIT_PTLOCKS -typedef atomic_long_t mm_counter_t; -#else /* !USE_SPLIT_PTLOCKS */ -typedef unsigned long mm_counter_t; -#endif /* !USE_SPLIT_PTLOCKS */ - /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -201,6 +195,22 @@ struct core_state { struct completion startup; }; +enum { + MM_FILEPAGES, + MM_ANONPAGES, + NR_MM_COUNTERS +}; + +#if USE_SPLIT_PTLOCKS +struct mm_rss_stat { + atomic_long_t count[NR_MM_COUNTERS]; +}; +#else /* !USE_SPLIT_PTLOCKS */ +struct mm_rss_stat { + unsigned long count[NR_MM_COUNTERS]; +}; +#endif /* !USE_SPLIT_PTLOCKS */ + struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ struct rb_root mm_rb; @@ -227,11 +237,6 @@ struct mm_struct { * by mmlist_lock */ - /* Special counters, in some configurations protected by the - * page_table_lock, in other configurations by being atomic. - */ - mm_counter_t _file_rss; - mm_counter_t _anon_rss; unsigned long hiwater_rss; /* High-watermark of RSS usage */ unsigned long hiwater_vm; /* High-water virtual memory usage */ @@ -244,6 +249,12 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ + /* + * Special counters, in some configurations protected by the + * page_table_lock, in other configurations by being atomic. + */ + struct mm_rss_stat rss_stat; + struct linux_binfmt *binfmt; cpumask_t cpu_vm_mask; diff --git a/include/linux/sched.h b/include/linux/sched.h index 4b1753f7e48e..cbeafa49a53b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -396,60 +396,6 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} #endif -#if USE_SPLIT_PTLOCKS -/* - * The mm counters are not protected by its page_table_lock, - * so must be incremented atomically. - */ -#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value) -#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member)) -#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member) -#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) -#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) - -#else /* !USE_SPLIT_PTLOCKS */ -/* - * The mm counters are protected by its page_table_lock, - * so can be incremented directly. - */ -#define set_mm_counter(mm, member, value) (mm)->_##member = (value) -#define get_mm_counter(mm, member) ((mm)->_##member) -#define add_mm_counter(mm, member, value) (mm)->_##member += (value) -#define inc_mm_counter(mm, member) (mm)->_##member++ -#define dec_mm_counter(mm, member) (mm)->_##member-- - -#endif /* !USE_SPLIT_PTLOCKS */ - -#define get_mm_rss(mm) \ - (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) -#define update_hiwater_rss(mm) do { \ - unsigned long _rss = get_mm_rss(mm); \ - if ((mm)->hiwater_rss < _rss) \ - (mm)->hiwater_rss = _rss; \ -} while (0) -#define update_hiwater_vm(mm) do { \ - if ((mm)->hiwater_vm < (mm)->total_vm) \ - (mm)->hiwater_vm = (mm)->total_vm; \ -} while (0) - -static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) -{ - return max(mm->hiwater_rss, get_mm_rss(mm)); -} - -static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, - struct mm_struct *mm) -{ - unsigned long hiwater_rss = get_mm_hiwater_rss(mm); - - if (*maxrss < hiwater_rss) - *maxrss = hiwater_rss; -} - -static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) -{ - return max(mm->hiwater_vm, mm->total_vm); -} extern void set_dumpable(struct mm_struct *mm, int value); extern int get_dumpable(struct mm_struct *mm); diff --git a/kernel/fork.c b/kernel/fork.c index 17bbf093356d..7616bcf107b9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -455,8 +455,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; mm->nr_ptes = 0; - set_mm_counter(mm, file_rss, 0); - set_mm_counter(mm, anon_rss, 0); + memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 00d59d048edf..0a67e041edf8 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -21,6 +21,7 @@ #include <linux/tsacct_kern.h> #include <linux/acct.h> #include <linux/jiffies.h> +#include <linux/mm.h> /* * fill in basic accounting fields diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 1888b2d71bb8..78b94f0b6d5d 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -194,7 +194,7 @@ retry: flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush_notify(vma, address, pte); page_remove_rmap(page); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); page_cache_release(page); diff --git a/mm/fremap.c b/mm/fremap.c index b6ec85abbb39..46f5dacf90a2 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, page_remove_rmap(page); page_cache_release(page); update_hiwater_rss(mm); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); } } else { if (!pte_file(pte)) diff --git a/mm/memory.c b/mm/memory.c index 72fb5f39bccc..c57678478801 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -121,6 +121,7 @@ static int __init init_zero_pfn(void) } core_initcall(init_zero_pfn); + /* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. Usually (but @@ -376,12 +377,18 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) return 0; } -static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) +static inline void init_rss_vec(int *rss) { - if (file_rss) - add_mm_counter(mm, file_rss, file_rss); - if (anon_rss) - add_mm_counter(mm, anon_rss, anon_rss); + memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); +} + +static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) + if (rss[i]) + add_mm_counter(mm, i, rss[i]); } /* @@ -632,7 +639,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (page) { get_page(page); page_dup_rmap(page); - rss[PageAnon(page)]++; + if (PageAnon(page)) + rss[MM_ANONPAGES]++; + else + rss[MM_FILEPAGES]++; } out_set_pte: @@ -648,11 +658,12 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; int progress = 0; - int rss[2]; + int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; again: - rss[1] = rss[0] = 0; + init_rss_vec(rss); + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) return -ENOMEM; @@ -688,7 +699,7 @@ again: arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); pte_unmap_nested(orig_src_pte); - add_mm_rss(dst_mm, rss[0], rss[1]); + add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); @@ -816,8 +827,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, struct mm_struct *mm = tlb->mm; pte_t *pte; spinlock_t *ptl; - int file_rss = 0; - int anon_rss = 0; + int rss[NR_MM_COUNTERS]; + + init_rss_vec(rss); pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -863,14 +875,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, set_pte_at(mm, addr, pte, pgoff_to_pte(page->index)); if (PageAnon(page)) - anon_rss--; + rss[MM_ANONPAGES]--; else { if (pte_dirty(ptent)) set_page_dirty(page); if (pte_young(ptent) && likely(!VM_SequentialReadHint(vma))) mark_page_accessed(page); - file_rss--; + rss[MM_FILEPAGES]--; } page_remove_rmap(page); if (unlikely(page_mapcount(page) < 0)) @@ -893,7 +905,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); - add_mm_rss(mm, file_rss, anon_rss); + add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); @@ -1527,7 +1539,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter(mm, file_rss); + inc_mm_counter(mm, MM_FILEPAGES); page_add_file_rmap(page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); @@ -2163,11 +2175,11 @@ gotten: if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter(mm, file_rss); - inc_mm_counter(mm, anon_rss); + dec_mm_counter(mm, MM_FILEPAGES); + inc_mm_counter(mm, MM_ANONPAGES); } } else - inc_mm_counter(mm, anon_rss); + inc_mm_counter(mm, MM_ANONPAGES); flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2604,7 +2616,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * discarded at swap_free(). */ - inc_mm_counter(mm, anon_rss); + inc_mm_counter(mm, MM_ANONPAGES); pte = mk_pte(page, vma->vm_page_prot); if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -2688,7 +2700,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; - inc_mm_counter(mm, anon_rss); + inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); setpte: set_pte_at(mm, address, page_table, entry); @@ -2842,10 +2854,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { - inc_mm_counter(mm, anon_rss); + inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); } else { - inc_mm_counter(mm, file_rss); + inc_mm_counter(mm, MM_FILEPAGES); page_add_file_rmap(page); if (flags & FAULT_FLAG_WRITE) { dirty_page = page; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 237050478f28..35755a4156d6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -401,8 +401,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose) "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", task_pid_nr(p), p->comm, K(p->mm->total_vm), - K(get_mm_counter(p->mm, anon_rss)), - K(get_mm_counter(p->mm, file_rss))); + K(get_mm_counter(p->mm, MM_ANONPAGES)), + K(get_mm_counter(p->mm, MM_FILEPAGES))); task_unlock(p); /* diff --git a/mm/rmap.c b/mm/rmap.c index 278cd277bdec..73d0472884c2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -815,9 +815,9 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { if (PageAnon(page)) - dec_mm_counter(mm, anon_rss); + dec_mm_counter(mm, MM_ANONPAGES); else - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); set_pte_at(mm, address, pte, swp_entry_to_pte(make_hwpoison_entry(page))); } else if (PageAnon(page)) { @@ -839,7 +839,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, list_add(&mm->mmlist, &init_mm.mmlist); spin_unlock(&mmlist_lock); } - dec_mm_counter(mm, anon_rss); + dec_mm_counter(mm, MM_ANONPAGES); } else if (PAGE_MIGRATION) { /* * Store the pfn of the page in a special migration @@ -857,7 +857,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, entry = make_migration_entry(page, pte_write(pteval)); set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); } else - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); page_remove_rmap(page); page_cache_release(page); @@ -996,7 +996,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, page_remove_rmap(page); page_cache_release(page); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); diff --git a/mm/swapfile.c b/mm/swapfile.c index 6c0585b16418..893984946a2c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -840,7 +840,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto out; } - inc_mm_counter(vma->vm_mm, anon_rss); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); -- cgit v1.2.3 From 34e55232e59f7b19050267a05ff1226e5cd122a5 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Date: Fri, 5 Mar 2010 13:41:40 -0800 Subject: mm: avoid false sharing of mm_counter Considering the nature of per mm stats, it's the shared object among threads and can be a cache-miss point in the page fault path. This patch adds per-thread cache for mm_counter. RSS value will be counted into a struct in task_struct and synchronized with mm's one at events. Now, in this patch, the event is the number of calls to handle_mm_fault. Per-thread value is added to mm at each 64 calls. rough estimation with small benchmark on parallel thread (2threads) shows [before] 4.5 cache-miss/faults [after] 4.0 cache-miss/faults Anyway, the most contended object is mmap_sem if the number of threads grows. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Minchan Kim <minchan.kim@gmail.com> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- Documentation/filesystems/proc.txt | 6 +++ fs/exec.c | 1 + include/linux/mm.h | 8 ++-- include/linux/mm_types.h | 6 +++ include/linux/sched.h | 4 +- kernel/exit.c | 3 +- mm/memory.c | 94 ++++++++++++++++++++++++++++++++++---- 7 files changed, 107 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0d07513a67a6..e418f3d8f427 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -188,6 +188,12 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file contains details information about the process itself. Its fields are explained in Table 1-4. +(for SMP CONFIG users) +For making accounting scalable, RSS related information are handled in +asynchronous manner and the vaule may not be very precise. To see a precise +snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table. +It's slow but very precise. + Table 1-2: Contents of the statm files (as of 2.6.30-rc7) .............................................................................. Field Content diff --git a/fs/exec.c b/fs/exec.c index cce6bbdbdbb1..ea7861727efd 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -718,6 +718,7 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; + sync_mm_rss(tsk, old_mm); mm_release(tsk, old_mm); if (old_mm) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 2124cdb2d1d0..8e580c07d171 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -873,7 +873,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, /* * per-process(per-mm_struct) statistics. */ -#if USE_SPLIT_PTLOCKS +#if defined(SPLIT_RSS_COUNTING) /* * The mm counters are not protected by its page_table_lock, * so must be incremented atomically. @@ -883,10 +883,7 @@ static inline void set_mm_counter(struct mm_struct *mm, int member, long value) atomic_long_set(&mm->rss_stat.count[member], value); } -static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) -{ - return (unsigned long)atomic_long_read(&mm->rss_stat.count[member]); -} +unsigned long get_mm_counter(struct mm_struct *mm, int member); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { @@ -974,6 +971,7 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, *maxrss = hiwater_rss; } +void sync_mm_rss(struct task_struct *task, struct mm_struct *mm); /* * A callback you can register to apply pressure to ageable caches. diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e1ca64be6678..21861239ab0c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -202,9 +202,15 @@ enum { }; #if USE_SPLIT_PTLOCKS +#define SPLIT_RSS_COUNTING struct mm_rss_stat { atomic_long_t count[NR_MM_COUNTERS]; }; +/* per-thread cached information, */ +struct task_rss_stat { + int events; /* for synchronization threshold */ + int count[NR_MM_COUNTERS]; +}; #else /* !USE_SPLIT_PTLOCKS */ struct mm_rss_stat { unsigned long count[NR_MM_COUNTERS]; diff --git a/include/linux/sched.h b/include/linux/sched.h index cbeafa49a53b..46c6f8d5dc06 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1220,7 +1220,9 @@ struct task_struct { struct plist_node pushable_tasks; struct mm_struct *mm, *active_mm; - +#if defined(SPLIT_RSS_COUNTING) + struct task_rss_stat rss_stat; +#endif /* task state */ int exit_state; int exit_code, exit_signal; diff --git a/kernel/exit.c b/kernel/exit.c index 45ed043b8bf5..10d3c5d5ae44 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -952,7 +952,8 @@ NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - + /* sync mm's RSS info before statistics gathering */ + sync_mm_rss(tsk, tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); diff --git a/mm/memory.c b/mm/memory.c index c57678478801..a4597614f18d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -122,6 +122,79 @@ static int __init init_zero_pfn(void) core_initcall(init_zero_pfn); +#if defined(SPLIT_RSS_COUNTING) + +void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + if (task->rss_stat.count[i]) { + add_mm_counter(mm, i, task->rss_stat.count[i]); + task->rss_stat.count[i] = 0; + } + } + task->rss_stat.events = 0; +} + +static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) +{ + struct task_struct *task = current; + + if (likely(task->mm == mm)) + task->rss_stat.count[member] += val; + else + add_mm_counter(mm, member, val); +} +#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) +#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) + +/* sync counter once per 64 page faults */ +#define TASK_RSS_EVENTS_THRESH (64) +static void check_sync_rss_stat(struct task_struct *task) +{ + if (unlikely(task != current)) + return; + if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) + __sync_task_rss_stat(task, task->mm); +} + +unsigned long get_mm_counter(struct mm_struct *mm, int member) +{ + long val = 0; + + /* + * Don't use task->mm here...for avoiding to use task_get_mm().. + * The caller must guarantee task->mm is not invalid. + */ + val = atomic_long_read(&mm->rss_stat.count[member]); + /* + * counter is updated in asynchronous manner and may go to minus. + * But it's never be expected number for users. + */ + if (val < 0) + return 0; + return (unsigned long)val; +} + +void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +{ + __sync_task_rss_stat(task, mm); +} +#else + +#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) +#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) + +static void check_sync_rss_stat(struct task_struct *task) +{ +} + +void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +{ +} +#endif + /* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. Usually (but @@ -386,6 +459,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) { int i; + if (current->mm == mm) + sync_mm_rss(current, mm); for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); @@ -1539,7 +1614,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_FILEPAGES); page_add_file_rmap(page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); @@ -2175,11 +2250,11 @@ gotten: if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter(mm, MM_FILEPAGES); - inc_mm_counter(mm, MM_ANONPAGES); + dec_mm_counter_fast(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); } } else - inc_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2616,7 +2691,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * discarded at swap_free(). */ - inc_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); pte = mk_pte(page, vma->vm_page_prot); if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -2700,7 +2775,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; - inc_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); setpte: set_pte_at(mm, address, page_table, entry); @@ -2854,10 +2929,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { - inc_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); } else { - inc_mm_counter(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_FILEPAGES); page_add_file_rmap(page); if (flags & FAULT_FLAG_WRITE) { dirty_page = page; @@ -3035,6 +3110,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(PGFAULT); + /* do counter updates before entering really critical section. */ + check_sync_rss_stat(current); + if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); -- cgit v1.2.3 From b084d4353ff99d824d3bc5a5c2c22c70b1fba722 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Date: Fri, 5 Mar 2010 13:41:42 -0800 Subject: mm: count swap usage A frequent questions from users about memory management is what numbers of swap ents are user for processes. And this information will give some hints to oom-killer. Besides we can count the number of swapents per a process by scanning /proc/<pid>/smaps, this is very slow and not good for usual process information handler which works like 'ps' or 'top'. (ps or top is now enough slow..) This patch adds a counter of swapents to mm_counter and update is at each swap events. Information is exported via /proc/<pid>/status file as [kamezawa@bluextal memory]$ cat /proc/self/status Name: cat State: R (running) Tgid: 2910 Pid: 2910 PPid: 2823 TracerPid: 0 Uid: 500 500 500 500 Gid: 500 500 500 500 FDSize: 256 Groups: 500 VmPeak: 82696 kB VmSize: 82696 kB VmLck: 0 kB VmHWM: 432 kB VmRSS: 432 kB VmData: 172 kB VmStk: 84 kB VmExe: 48 kB VmLib: 1568 kB VmPTE: 40 kB VmSwap: 0 kB <=============== this. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Reviewed-by: Minchan Kim <minchan.kim@gmail.com> Reviewed-by: Christoph Lameter <cl@linux-foundation.org> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- Documentation/filesystems/proc.txt | 2 ++ fs/proc/task_mmu.c | 9 ++++++--- include/linux/mm_types.h | 1 + mm/memory.c | 16 ++++++++++++---- mm/rmap.c | 1 + mm/swapfile.c | 1 + 6 files changed, 23 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index e418f3d8f427..b5c5fc657a88 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -164,6 +164,7 @@ read the file /proc/PID/status: VmExe: 68 kB VmLib: 1412 kB VmPTE: 20 kb + VmSwap: 0 kB Threads: 1 SigQ: 0/28578 SigPnd: 0000000000000000 @@ -219,6 +220,7 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7) VmExe size of text segment VmLib size of shared library code VmPTE size of page table entries + VmSwap size of swap usage (the number of referred swapents) Threads number of threads SigQ number of signals queued/max. number for queue SigPnd bitmap of pending signals for the thread diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 375581276011..183f8ff5f400 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -16,7 +16,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib; + unsigned long data, text, lib, swap; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; /* @@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) data = mm->total_vm - mm->shared_vm - mm->stack_vm; text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; + swap = get_mm_counter(mm, MM_SWAPENTS); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmStk:\t%8lu kB\n" "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" - "VmPTE:\t%8lu kB\n", + "VmPTE:\t%8lu kB\n" + "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), @@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); + (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10, + swap << (PAGE_SHIFT-10)); } unsigned long task_vsize(struct mm_struct *mm) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 21861239ab0c..19549d7275ab 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -198,6 +198,7 @@ struct core_state { enum { MM_FILEPAGES, MM_ANONPAGES, + MM_SWAPENTS, NR_MM_COUNTERS }; diff --git a/mm/memory.c b/mm/memory.c index a4597614f18d..77d9f840936b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -679,7 +679,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, &src_mm->mmlist); spin_unlock(&mmlist_lock); } - if (is_write_migration_entry(entry) && + if (likely(!non_swap_entry(entry))) + rss[MM_SWAPENTS]++; + else if (is_write_migration_entry(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both parent @@ -974,9 +976,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (pte_file(ptent)) { if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) print_bad_pte(vma, addr, ptent, NULL); - } else if - (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) - print_bad_pte(vma, addr, ptent, NULL); + } else { + swp_entry_t entry = pte_to_swp_entry(ptent); + + if (!non_swap_entry(entry)) + rss[MM_SWAPENTS]--; + if (unlikely(!free_swap_and_cache(entry))) + print_bad_pte(vma, addr, ptent, NULL); + } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); @@ -2692,6 +2699,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, */ inc_mm_counter_fast(mm, MM_ANONPAGES); + dec_mm_counter_fast(mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); diff --git a/mm/rmap.c b/mm/rmap.c index 73d0472884c2..5cb47111f79e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -840,6 +840,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, spin_unlock(&mmlist_lock); } dec_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_SWAPENTS); } else if (PAGE_MIGRATION) { /* * Store the pfn of the page in a special migration diff --git a/mm/swapfile.c b/mm/swapfile.c index 893984946a2c..187a21f8b7bd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -840,6 +840,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto out; } + dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); set_pte_at(vma->vm_mm, addr, pte, -- cgit v1.2.3 From 42e49608683ab25fbbbf9c40edb944601e543882 Mon Sep 17 00:00:00 2001 From: Wu Fengguang <fengguang.wu@intel.com> Date: Fri, 5 Mar 2010 13:42:01 -0800 Subject: vfs: take f_lock on modifying f_mode after open time We'll introduce FMODE_RANDOM which will be runtime modified. So protect all runtime modification to f_mode with f_lock to avoid races. Signed-off-by: Wu Fengguang <fengguang.wu@intel.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Christoph Hellwig <hch@infradead.org> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: Chuck Lever <chuck.lever@oracle.com> Cc: <stable@kernel.org> [2.6.33.x] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/file_table.c | 2 ++ fs/nfsd/nfs4state.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/file_table.c b/fs/file_table.c index b98404b54383..32d12b78bac8 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -393,7 +393,9 @@ retry: continue; if (!(f->f_mode & FMODE_WRITE)) continue; + spin_lock(&f->f_lock); f->f_mode &= ~FMODE_WRITE; + spin_unlock(&f->f_lock); if (file_check_writeable(f) != 0) continue; file_release_write(f); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f19ed866c95f..fcafe6087f69 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1998,7 +1998,9 @@ nfs4_file_downgrade(struct file *filp, unsigned int share_access) { if (share_access & NFS4_SHARE_ACCESS_WRITE) { drop_file_write_access(filp); + spin_lock(&filp->f_lock); filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE; + spin_unlock(&filp->f_lock); } } -- cgit v1.2.3 From 5beb49305251e5669852ed541e8e2f2f7696c53e Mon Sep 17 00:00:00 2001 From: Rik van Riel <riel@redhat.com> Date: Fri, 5 Mar 2010 13:42:07 -0800 Subject: mm: change anon_vma linking to fix multi-process server scalability issue The old anon_vma code can lead to scalability issues with heavily forking workloads. Specifically, each anon_vma will be shared between the parent process and all its child processes. In a workload with 1000 child processes and a VMA with 1000 anonymous pages per process that get COWed, this leads to a system with a million anonymous pages in the same anon_vma, each of which is mapped in just one of the 1000 processes. However, the current rmap code needs to walk them all, leading to O(N) scanning complexity for each page. This can result in systems where one CPU is walking the page tables of 1000 processes in page_referenced_one, while all other CPUs are stuck on the anon_vma lock. This leads to catastrophic failure for a benchmark like AIM7, where the total number of processes can reach in the tens of thousands. Real workloads are still a factor 10 less process intensive than AIM7, but they are catching up. This patch changes the way anon_vmas and VMAs are linked, which allows us to associate multiple anon_vmas with a VMA. At fork time, each child process gets its own anon_vmas, in which its COWed pages will be instantiated. The parents' anon_vma is also linked to the VMA, because non-COWed pages could be present in any of the children. This reduces rmap scanning complexity to O(1) for the pages of the 1000 child processes, with O(N) complexity for at most 1/N pages in the system. This reduces the average scanning cost in heavily forking workloads from O(N) to 2. The only real complexity in this patch stems from the fact that linking a VMA to anon_vmas now involves memory allocations. This means vma_adjust can fail, if it needs to attach a VMA to anon_vma structures. This in turn means error handling needs to be added to the calling functions. A second source of complexity is that, because there can be multiple anon_vmas, the anon_vma linking in vma_adjust can no longer be done under "the" anon_vma lock. To prevent the rmap code from walking up an incomplete VMA, this patch introduces the VM_LOCK_RMAP VMA flag. This bit flag uses the same slot as the NOMMU VM_MAPPED_COPY, with an ifdef in mm.h to make sure it is impossible to compile a kernel that needs both symbolic values for the same bitflag. Some test results: Without the anon_vma changes, when AIM7 hits around 9.7k users (on a test box with 16GB RAM and not quite enough IO), the system ends up running >99% in system time, with every CPU on the same anon_vma lock in the pageout code. With these changes, AIM7 hits the cross-over point around 29.7k users. This happens with ~99% IO wait time, there never seems to be any spike in system time. The anon_vma lock contention appears to be resolved. [akpm@linux-foundation.org: cleanups] Signed-off-by: Rik van Riel <riel@redhat.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Larry Woodman <lwoodman@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Cc: Minchan Kim <minchan.kim@gmail.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/ia64/kernel/perfmon.c | 1 + arch/ia64/mm/init.c | 2 + fs/exec.c | 6 +- include/linux/mm.h | 6 +- include/linux/mm_types.h | 3 +- include/linux/rmap.h | 35 ++++++++-- kernel/fork.c | 6 +- mm/ksm.c | 12 +++- mm/memory-failure.c | 5 +- mm/memory.c | 4 +- mm/mmap.c | 138 +++++++++++++++++++++++++++------------ mm/mremap.c | 7 +- mm/nommu.c | 2 +- mm/rmap.c | 156 +++++++++++++++++++++++++++++++++++++-------- 14 files changed, 298 insertions(+), 85 deletions(-) (limited to 'fs') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index b81e46b1629b..703062c44fb9 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2315,6 +2315,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t DPRINT(("Cannot allocate vma\n")); goto error_kmem; } + INIT_LIST_HEAD(&vma->anon_vma_chain); /* * partially initialize the vma for the sampling buffer diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index ca3335ea56cc..ed41759efcac 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -117,6 +117,7 @@ ia64_init_addr_space (void) */ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (vma) { + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; vma->vm_start = current->thread.rbs_bot & PAGE_MASK; vma->vm_end = vma->vm_start + PAGE_SIZE; @@ -135,6 +136,7 @@ ia64_init_addr_space (void) if (!(current->personality & MMAP_PAGE_ZERO)) { vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (vma) { + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; vma->vm_end = PAGE_SIZE; vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); diff --git a/fs/exec.c b/fs/exec.c index ea7861727efd..591030735591 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -246,6 +246,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_STACK_FLAGS; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + INIT_LIST_HEAD(&vma->anon_vma_chain); err = insert_vm_struct(mm, vma); if (err) goto err; @@ -516,7 +517,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * cover the whole range: [new_start, old_end) */ - vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL); + if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL)) + return -ENOMEM; /* * move the page tables downwards, on failure we rely on @@ -547,7 +549,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) tlb_finish_mmu(tlb, new_end, old_end); /* - * shrink the vma to just the new range. + * Shrink the vma to just the new range. Always succeeds. */ vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); diff --git a/include/linux/mm.h b/include/linux/mm.h index 8e580c07d171..8e2841a2f441 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -97,7 +97,11 @@ extern unsigned int kobjsize(const void *objp); #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ +#ifdef CONFIG_MMU +#define VM_LOCK_RMAP 0x01000000 /* Do not follow this rmap (mmu mmap) */ +#else #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ +#endif #define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ #define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ @@ -1216,7 +1220,7 @@ static inline void vma_nonlinear_insert(struct vm_area_struct *vma, /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); -extern void vma_adjust(struct vm_area_struct *vma, unsigned long start, +extern int vma_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert); extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 19549d7275ab..048b46270aa5 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -163,7 +163,8 @@ struct vm_area_struct { * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack * or brk vma (with NULL file) can only be in an anon_vma list. */ - struct list_head anon_vma_node; /* Serialized by anon_vma->lock */ + struct list_head anon_vma_chain; /* Serialized by mmap_sem & + * page_table_lock */ struct anon_vma *anon_vma; /* Serialized by page_table_lock */ /* Function pointers to deal with this struct. */ diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b019ae64e2ab..62da2001d55c 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -37,7 +37,27 @@ struct anon_vma { * is serialized by a system wide lock only visible to * mm_take_all_locks() (mm_all_locks_mutex). */ - struct list_head head; /* List of private "related" vmas */ + struct list_head head; /* Chain of private "related" vmas */ +}; + +/* + * The copy-on-write semantics of fork mean that an anon_vma + * can become associated with multiple processes. Furthermore, + * each child process will have its own anon_vma, where new + * pages for that process are instantiated. + * + * This structure allows us to find the anon_vmas associated + * with a VMA, or the VMAs associated with an anon_vma. + * The "same_vma" list contains the anon_vma_chains linking + * all the anon_vmas associated with this VMA. + * The "same_anon_vma" list contains the anon_vma_chains + * which link all the VMAs associated with this anon_vma. + */ +struct anon_vma_chain { + struct vm_area_struct *vma; + struct anon_vma *anon_vma; + struct list_head same_vma; /* locked by mmap_sem & page_table_lock */ + struct list_head same_anon_vma; /* locked by anon_vma->lock */ }; #ifdef CONFIG_MMU @@ -89,12 +109,19 @@ static inline void anon_vma_unlock(struct vm_area_struct *vma) */ void anon_vma_init(void); /* create anon_vma_cachep */ int anon_vma_prepare(struct vm_area_struct *); -void __anon_vma_merge(struct vm_area_struct *, struct vm_area_struct *); -void anon_vma_unlink(struct vm_area_struct *); -void anon_vma_link(struct vm_area_struct *); +void unlink_anon_vmas(struct vm_area_struct *); +int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); +int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); void __anon_vma_link(struct vm_area_struct *); void anon_vma_free(struct anon_vma *); +static inline void anon_vma_merge(struct vm_area_struct *vma, + struct vm_area_struct *next) +{ + VM_BUG_ON(vma->anon_vma != next->anon_vma); + unlink_anon_vmas(next); +} + /* * rmap interfaces called when adding or removing pte of page */ diff --git a/kernel/fork.c b/kernel/fork.c index 7616bcf107b9..bab7b254ad39 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -329,15 +329,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (!tmp) goto fail_nomem; *tmp = *mpnt; + INIT_LIST_HEAD(&tmp->anon_vma_chain); pol = mpol_dup(vma_policy(mpnt)); retval = PTR_ERR(pol); if (IS_ERR(pol)) goto fail_nomem_policy; vma_set_policy(tmp, pol); + if (anon_vma_fork(tmp, mpnt)) + goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~VM_LOCKED; tmp->vm_mm = mm; tmp->vm_next = NULL; - anon_vma_link(tmp); file = tmp->vm_file; if (file) { struct inode *inode = file->f_path.dentry->d_inode; @@ -392,6 +394,8 @@ out: flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); return retval; +fail_nomem_anon_vma_fork: + mpol_put(pol); fail_nomem_policy: kmem_cache_free(vm_area_cachep, tmp); fail_nomem: diff --git a/mm/ksm.c b/mm/ksm.c index 56a0da1f9979..a93f1b7f508c 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1563,10 +1563,12 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, again: hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; + struct anon_vma_chain *vmac; struct vm_area_struct *vma; spin_lock(&anon_vma->lock); - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) continue; @@ -1614,10 +1616,12 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) again: hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; + struct anon_vma_chain *vmac; struct vm_area_struct *vma; spin_lock(&anon_vma->lock); - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) continue; @@ -1664,10 +1668,12 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, again: hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; + struct anon_vma_chain *vmac; struct vm_area_struct *vma; spin_lock(&anon_vma->lock); - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) continue; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 17299fd4577c..d1f335162976 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -383,9 +383,12 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, if (av == NULL) /* Not actually mapped anymore */ goto out; for_each_process (tsk) { + struct anon_vma_chain *vmac; + if (!task_early_kill(tsk)) continue; - list_for_each_entry (vma, &av->head, anon_vma_node) { + list_for_each_entry(vmac, &av->head, same_anon_vma) { + vma = vmac->vma; if (!page_mapped_in_vma(page, vma)) continue; if (vma->vm_mm == tsk->mm) diff --git a/mm/memory.c b/mm/memory.c index 77d9f840936b..dc785b438d70 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -374,7 +374,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, * Hide vma from rmap and truncate_pagecache before freeing * pgtables */ - anon_vma_unlink(vma); + unlink_anon_vmas(vma); unlink_file_vma(vma); if (is_vm_hugetlb_page(vma)) { @@ -388,7 +388,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; - anon_vma_unlink(vma); + unlink_anon_vmas(vma); unlink_file_vma(vma); } free_pgd_range(tlb, addr, vma->vm_end, diff --git a/mm/mmap.c b/mm/mmap.c index 31656147128e..6a0c15db7f60 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -437,7 +437,6 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, { __vma_link_list(mm, vma, prev, rb_parent); __vma_link_rb(mm, vma, rb_link, rb_parent); - __anon_vma_link(vma); } static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, @@ -499,7 +498,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, * are necessary. The "insert" vma (if any) is to be inserted * before we drop the necessary locks. */ -void vma_adjust(struct vm_area_struct *vma, unsigned long start, +int vma_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) { struct mm_struct *mm = vma->vm_mm; @@ -542,6 +541,28 @@ again: remove_next = 1 + (end > next->vm_end); } } + /* + * When changing only vma->vm_end, we don't really need anon_vma lock. + */ + if (vma->anon_vma && (insert || importer || start != vma->vm_start)) + anon_vma = vma->anon_vma; + if (anon_vma) { + /* + * Easily overlooked: when mprotect shifts the boundary, + * make sure the expanding vma has anon_vma set if the + * shrinking vma had, to cover any anon pages imported. + */ + if (importer && !importer->anon_vma) { + /* Block reverse map lookups until things are set up. */ + importer->vm_flags |= VM_LOCK_RMAP; + if (anon_vma_clone(importer, vma)) { + importer->vm_flags &= ~VM_LOCK_RMAP; + return -ENOMEM; + } + importer->anon_vma = anon_vma; + } + } + if (file) { mapping = file->f_mapping; if (!(vma->vm_flags & VM_NONLINEAR)) @@ -567,25 +588,6 @@ again: remove_next = 1 + (end > next->vm_end); } } - /* - * When changing only vma->vm_end, we don't really need - * anon_vma lock. - */ - if (vma->anon_vma && (insert || importer || start != vma->vm_start)) - anon_vma = vma->anon_vma; - if (anon_vma) { - spin_lock(&anon_vma->lock); - /* - * Easily overlooked: when mprotect shifts the boundary, - * make sure the expanding vma has anon_vma set if the - * shrinking vma had, to cover any anon pages imported. - */ - if (importer && !importer->anon_vma) { - importer->anon_vma = anon_vma; - __anon_vma_link(importer); - } - } - if (root) { flush_dcache_mmap_lock(mapping); vma_prio_tree_remove(vma, root); @@ -616,8 +618,11 @@ again: remove_next = 1 + (end > next->vm_end); __vma_unlink(mm, next, vma); if (file) __remove_shared_vm_struct(next, file, mapping); - if (next->anon_vma) - __anon_vma_merge(vma, next); + /* + * This VMA is now dead, no need for rmap to follow it. + * Call anon_vma_merge below, outside of i_mmap_lock. + */ + next->vm_flags |= VM_LOCK_RMAP; } else if (insert) { /* * split_vma has split insert from vma, and needs @@ -627,17 +632,25 @@ again: remove_next = 1 + (end > next->vm_end); __insert_vm_struct(mm, insert); } - if (anon_vma) - spin_unlock(&anon_vma->lock); if (mapping) spin_unlock(&mapping->i_mmap_lock); + /* + * The current VMA has been set up. It is now safe for the + * rmap code to get from the pages to the ptes. + */ + if (anon_vma && importer) + importer->vm_flags &= ~VM_LOCK_RMAP; + if (remove_next) { if (file) { fput(file); if (next->vm_flags & VM_EXECUTABLE) removed_exe_file_vma(mm); } + /* Protected by mmap_sem and VM_LOCK_RMAP. */ + if (next->anon_vma) + anon_vma_merge(vma, next); mm->map_count--; mpol_put(vma_policy(next)); kmem_cache_free(vm_area_cachep, next); @@ -653,6 +666,8 @@ again: remove_next = 1 + (end > next->vm_end); } validate_mm(mm); + + return 0; } /* @@ -759,6 +774,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; + int err; /* * We later require that vma->vm_flags == vm_flags, @@ -792,11 +808,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, is_mergeable_anon_vma(prev->anon_vma, next->anon_vma)) { /* cases 1, 6 */ - vma_adjust(prev, prev->vm_start, + err = vma_adjust(prev, prev->vm_start, next->vm_end, prev->vm_pgoff, NULL); } else /* cases 2, 5, 7 */ - vma_adjust(prev, prev->vm_start, + err = vma_adjust(prev, prev->vm_start, end, prev->vm_pgoff, NULL); + if (err) + return NULL; return prev; } @@ -808,11 +826,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen)) { if (prev && addr < prev->vm_end) /* case 4 */ - vma_adjust(prev, prev->vm_start, + err = vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL); else /* cases 3, 8 */ - vma_adjust(area, addr, next->vm_end, + err = vma_adjust(area, addr, next->vm_end, next->vm_pgoff - pglen, NULL); + if (err) + return NULL; return area; } @@ -1205,6 +1225,7 @@ munmap_back: vma->vm_flags = vm_flags; vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; + INIT_LIST_HEAD(&vma->anon_vma_chain); if (file) { error = -EINVAL; @@ -1865,6 +1886,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, { struct mempolicy *pol; struct vm_area_struct *new; + int err = -ENOMEM; if (is_vm_hugetlb_page(vma) && (addr & ~(huge_page_mask(hstate_vma(vma))))) @@ -1872,11 +1894,13 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!new) - return -ENOMEM; + goto out_err; /* most fields are the same, copy all, and then fixup */ *new = *vma; + INIT_LIST_HEAD(&new->anon_vma_chain); + if (new_below) new->vm_end = addr; else { @@ -1886,11 +1910,14 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, pol = mpol_dup(vma_policy(vma)); if (IS_ERR(pol)) { - kmem_cache_free(vm_area_cachep, new); - return PTR_ERR(pol); + err = PTR_ERR(pol); + goto out_free_vma; } vma_set_policy(new, pol); + if (anon_vma_clone(new, vma)) + goto out_free_mpol; + if (new->vm_file) { get_file(new->vm_file); if (vma->vm_flags & VM_EXECUTABLE) @@ -1901,12 +1928,28 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, new->vm_ops->open(new); if (new_below) - vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + + err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT), new); else - vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); - return 0; + /* Success. */ + if (!err) + return 0; + + /* Clean everything up if vma_adjust failed. */ + new->vm_ops->close(new); + if (new->vm_file) { + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + fput(new->vm_file); + } + out_free_mpol: + mpol_put(pol); + out_free_vma: + kmem_cache_free(vm_area_cachep, new); + out_err: + return err; } /* @@ -2116,6 +2159,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) return -ENOMEM; } + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; @@ -2252,10 +2296,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma) { *new_vma = *vma; pol = mpol_dup(vma_policy(vma)); - if (IS_ERR(pol)) { - kmem_cache_free(vm_area_cachep, new_vma); - return NULL; - } + if (IS_ERR(pol)) + goto out_free_vma; + INIT_LIST_HEAD(&new_vma->anon_vma_chain); + if (anon_vma_clone(new_vma, vma)) + goto out_free_mempol; vma_set_policy(new_vma, pol); new_vma->vm_start = addr; new_vma->vm_end = addr + len; @@ -2271,6 +2316,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, } } return new_vma; + + out_free_mempol: + mpol_put(pol); + out_free_vma: + kmem_cache_free(vm_area_cachep, new_vma); + return NULL; } /* @@ -2348,6 +2399,7 @@ int install_special_mapping(struct mm_struct *mm, if (unlikely(vma == NULL)) return -ENOMEM; + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; @@ -2448,6 +2500,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) int mm_take_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; + struct anon_vma_chain *avc; int ret = -EINTR; BUG_ON(down_read_trylock(&mm->mmap_sem)); @@ -2465,7 +2518,8 @@ int mm_take_all_locks(struct mm_struct *mm) if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) - vm_lock_anon_vma(mm, vma->anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + vm_lock_anon_vma(mm, avc->anon_vma); } ret = 0; @@ -2520,13 +2574,15 @@ static void vm_unlock_mapping(struct address_space *mapping) void mm_drop_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; + struct anon_vma_chain *avc; BUG_ON(down_read_trylock(&mm->mmap_sem)); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->anon_vma) - vm_unlock_anon_vma(vma->anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + vm_unlock_anon_vma(avc->anon_vma); if (vma->vm_file && vma->vm_file->f_mapping) vm_unlock_mapping(vma->vm_file->f_mapping); } diff --git a/mm/mremap.c b/mm/mremap.c index 4c4c803453f3..e9c75efce609 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -460,8 +460,11 @@ unsigned long do_mremap(unsigned long addr, if (vma_expandable(vma, new_len - old_len)) { int pages = (new_len - old_len) >> PAGE_SHIFT; - vma_adjust(vma, vma->vm_start, - addr + new_len, vma->vm_pgoff, NULL); + if (vma_adjust(vma, vma->vm_start, addr + new_len, + vma->vm_pgoff, NULL)) { + ret = -ENOMEM; + goto out; + } mm->total_vm += pages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); diff --git a/mm/nommu.c b/mm/nommu.c index 48a2ecfaf059..55727a74af98 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1209,7 +1209,7 @@ unsigned long do_mmap_pgoff(struct file *file, region->vm_flags = vm_flags; region->vm_pgoff = pgoff; - INIT_LIST_HEAD(&vma->anon_vma_node); + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_flags = vm_flags; vma->vm_pgoff = pgoff; diff --git a/mm/rmap.c b/mm/rmap.c index 5cb47111f79e..be34094e4595 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -62,6 +62,7 @@ #include "internal.h" static struct kmem_cache *anon_vma_cachep; +static struct kmem_cache *anon_vma_chain_cachep; static inline struct anon_vma *anon_vma_alloc(void) { @@ -73,6 +74,16 @@ void anon_vma_free(struct anon_vma *anon_vma) kmem_cache_free(anon_vma_cachep, anon_vma); } +static inline struct anon_vma_chain *anon_vma_chain_alloc(void) +{ + return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); +} + +void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) +{ + kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); +} + /** * anon_vma_prepare - attach an anon_vma to a memory region * @vma: the memory region in question @@ -103,18 +114,23 @@ void anon_vma_free(struct anon_vma *anon_vma) int anon_vma_prepare(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma_chain *avc; might_sleep(); if (unlikely(!anon_vma)) { struct mm_struct *mm = vma->vm_mm; struct anon_vma *allocated; + avc = anon_vma_chain_alloc(); + if (!avc) + goto out_enomem; + anon_vma = find_mergeable_anon_vma(vma); allocated = NULL; if (!anon_vma) { anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) - return -ENOMEM; + goto out_enomem_free_avc; allocated = anon_vma; } spin_lock(&anon_vma->lock); @@ -123,53 +139,113 @@ int anon_vma_prepare(struct vm_area_struct *vma) spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; - list_add_tail(&vma->anon_vma_node, &anon_vma->head); + avc->anon_vma = anon_vma; + avc->vma = vma; + list_add(&avc->same_vma, &vma->anon_vma_chain); + list_add(&avc->same_anon_vma, &anon_vma->head); allocated = NULL; } spin_unlock(&mm->page_table_lock); spin_unlock(&anon_vma->lock); - if (unlikely(allocated)) + if (unlikely(allocated)) { anon_vma_free(allocated); + anon_vma_chain_free(avc); + } } return 0; + + out_enomem_free_avc: + anon_vma_chain_free(avc); + out_enomem: + return -ENOMEM; } -void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) +static void anon_vma_chain_link(struct vm_area_struct *vma, + struct anon_vma_chain *avc, + struct anon_vma *anon_vma) { - BUG_ON(vma->anon_vma != next->anon_vma); - list_del(&next->anon_vma_node); + avc->vma = vma; + avc->anon_vma = anon_vma; + list_add(&avc->same_vma, &vma->anon_vma_chain); + + spin_lock(&anon_vma->lock); + list_add_tail(&avc->same_anon_vma, &anon_vma->head); + spin_unlock(&anon_vma->lock); } -void __anon_vma_link(struct vm_area_struct *vma) +/* + * Attach the anon_vmas from src to dst. + * Returns 0 on success, -ENOMEM on failure. + */ +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { - struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma_chain *avc, *pavc; + + list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) { + avc = anon_vma_chain_alloc(); + if (!avc) + goto enomem_failure; + anon_vma_chain_link(dst, avc, pavc->anon_vma); + } + return 0; - if (anon_vma) - list_add_tail(&vma->anon_vma_node, &anon_vma->head); + enomem_failure: + unlink_anon_vmas(dst); + return -ENOMEM; } -void anon_vma_link(struct vm_area_struct *vma) +/* + * Attach vma to its own anon_vma, as well as to the anon_vmas that + * the corresponding VMA in the parent process is attached to. + * Returns 0 on success, non-zero on failure. + */ +int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) { - struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma_chain *avc; + struct anon_vma *anon_vma; - if (anon_vma) { - spin_lock(&anon_vma->lock); - list_add_tail(&vma->anon_vma_node, &anon_vma->head); - spin_unlock(&anon_vma->lock); - } + /* Don't bother if the parent process has no anon_vma here. */ + if (!pvma->anon_vma) + return 0; + + /* + * First, attach the new VMA to the parent VMA's anon_vmas, + * so rmap can find non-COWed pages in child processes. + */ + if (anon_vma_clone(vma, pvma)) + return -ENOMEM; + + /* Then add our own anon_vma. */ + anon_vma = anon_vma_alloc(); + if (!anon_vma) + goto out_error; + avc = anon_vma_chain_alloc(); + if (!avc) + goto out_error_free_anon_vma; + anon_vma_chain_link(vma, avc, anon_vma); + /* Mark this anon_vma as the one where our new (COWed) pages go. */ + vma->anon_vma = anon_vma; + + return 0; + + out_error_free_anon_vma: + anon_vma_free(anon_vma); + out_error: + return -ENOMEM; } -void anon_vma_unlink(struct vm_area_struct *vma) +static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) { - struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma *anon_vma = anon_vma_chain->anon_vma; int empty; + /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */ if (!anon_vma) return; spin_lock(&anon_vma->lock); - list_del(&vma->anon_vma_node); + list_del(&anon_vma_chain->same_anon_vma); /* We must garbage collect the anon_vma if it's empty */ empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); @@ -179,6 +255,18 @@ void anon_vma_unlink(struct vm_area_struct *vma) anon_vma_free(anon_vma); } +void unlink_anon_vmas(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc, *next; + + /* Unlink each anon_vma chained to the VMA. */ + list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { + anon_vma_unlink(avc); + list_del(&avc->same_vma); + anon_vma_chain_free(avc); + } +} + static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; @@ -192,6 +280,7 @@ void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); + anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); } /* @@ -240,6 +329,18 @@ vma_address(struct page *page, struct vm_area_struct *vma) /* page should be within @vma mapping range */ return -EFAULT; } + if (unlikely(vma->vm_flags & VM_LOCK_RMAP)) { + /* + * This VMA is being unlinked or is not yet linked into the + * VMA tree. Do not try to follow this rmap. This race + * condition can result in page_referenced() ignoring a + * reference or in try_to_unmap() failing to unmap a page. + * The VMA cannot be freed under us because we hold the + * anon_vma->lock, which the munmap code takes while + * unlinking the anon_vmas from the VMA. + */ + return -EFAULT; + } return address; } @@ -396,7 +497,7 @@ static int page_referenced_anon(struct page *page, { unsigned int mapcount; struct anon_vma *anon_vma; - struct vm_area_struct *vma; + struct anon_vma_chain *avc; int referenced = 0; anon_vma = page_lock_anon_vma(page); @@ -404,7 +505,8 @@ static int page_referenced_anon(struct page *page, return referenced; mapcount = page_mapcount(page); - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; @@ -1025,14 +1127,15 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) { struct anon_vma *anon_vma; - struct vm_area_struct *vma; + struct anon_vma_chain *avc; int ret = SWAP_AGAIN; anon_vma = page_lock_anon_vma(page); if (!anon_vma) return ret; - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; @@ -1223,7 +1326,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, struct vm_area_struct *, unsigned long, void *), void *arg) { struct anon_vma *anon_vma; - struct vm_area_struct *vma; + struct anon_vma_chain *avc; int ret = SWAP_AGAIN; /* @@ -1238,7 +1341,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, if (!anon_vma) return ret; spin_lock(&anon_vma->lock); - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; -- cgit v1.2.3 From d554ed895dc8f293cc712c71f14b101ace82579a Mon Sep 17 00:00:00 2001 From: Jiri Slaby <jslaby@suse.cz> Date: Fri, 5 Mar 2010 13:42:42 -0800 Subject: fs: use rlimit helpers Make sure compiler won't do weird things with limits. E.g. fetching them twice may return 2 different values after writable limits are implemented. I.e. either use rlimit helpers added in commit 3e10e716abf3 ("resource: add helpers for fetching rlimits") or ACCESS_ONCE if not applicable. Signed-off-by: Jiri Slaby <jslaby@suse.cz> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/attr.c | 2 +- fs/binfmt_aout.c | 2 +- fs/binfmt_flat.c | 2 +- fs/exec.c | 8 ++++---- fs/fcntl.c | 2 +- fs/file.c | 2 +- fs/proc/array.c | 4 ++-- fs/select.c | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index 0a6ea54cde7d..0815e93bb487 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -81,7 +81,7 @@ int inode_newsize_ok(const struct inode *inode, loff_t offset) if (inode->i_size < offset) { unsigned long limit; - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY && offset > limit) goto out_sig; if (offset > inode->i_sb->s_maxbytes) diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index fdd397099172..61dd00a6c7b4 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -247,7 +247,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) * size limits imposed on them by creating programs with large * arrays in the data or bss. */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + rlim = rlimit(RLIMIT_DATA); if (rlim >= RLIM_INFINITY) rlim = ~0; if (ex.a_data + ex.a_bss > rlim) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 42c6b4a54445..e0e769bdca59 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -501,7 +501,7 @@ static int load_flat_file(struct linux_binprm * bprm, * size limits imposed on them by creating programs with large * arrays in the data or bss. */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + rlim = rlimit(RLIMIT_DATA); if (rlim >= RLIM_INFINITY) rlim = ~0; if (data_len + bss_len > rlim) { diff --git a/fs/exec.c b/fs/exec.c index 591030735591..6348d79401de 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -195,7 +195,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * to work from. */ rlim = current->signal->rlim; - if (size > rlim[RLIMIT_STACK].rlim_cur / 4) { + if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) { put_page(page); return NULL; } @@ -579,7 +579,7 @@ int setup_arg_pages(struct linux_binprm *bprm, #ifdef CONFIG_STACK_GROWSUP /* Limit stack size to 1GB */ - stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max; + stack_base = rlimit_max(RLIMIT_STACK); if (stack_base > (1 << 30)) stack_base = 1 << 30; @@ -1535,7 +1535,7 @@ static int format_corename(char *corename, long signr) /* core limit size */ case 'c': rc = snprintf(out_ptr, out_end - out_ptr, - "%lu", current->signal->rlim[RLIMIT_CORE].rlim_cur); + "%lu", rlimit(RLIMIT_CORE)); if (rc > out_end - out_ptr) goto out; out_ptr += rc; @@ -1800,7 +1800,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) struct coredump_params cprm = { .signr = signr, .regs = regs, - .limit = current->signal->rlim[RLIMIT_CORE].rlim_cur, + .limit = rlimit(RLIMIT_CORE), }; audit_core_dumps(signr); diff --git a/fs/fcntl.c b/fs/fcntl.c index 97e01dc0d95f..452d02f9075e 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -344,7 +344,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, switch (cmd) { case F_DUPFD: case F_DUPFD_CLOEXEC: - if (arg >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) + if (arg >= rlimit(RLIMIT_NOFILE)) break; err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0); if (err >= 0) { diff --git a/fs/file.c b/fs/file.c index 38039af67663..34bb7f71d994 100644 --- a/fs/file.c +++ b/fs/file.c @@ -257,7 +257,7 @@ int expand_files(struct files_struct *files, int nr) * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ - if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) + if (nr >= rlimit(RLIMIT_NOFILE)) return -EMFILE; /* Do we need to expand? */ diff --git a/fs/proc/array.c b/fs/proc/array.c index 18e20feee251..aa8637b81028 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -273,7 +273,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) rcu_read_lock(); /* FIXME: is this correct? */ qsize = atomic_read(&__task_cred(p)->user->sigpending); rcu_read_unlock(); - qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; + qlim = task_rlimit(p, RLIMIT_SIGPENDING); unlock_task_sighand(p, &flags); } @@ -420,7 +420,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, cutime = sig->cutime; cstime = sig->cstime; cgtime = sig->cgtime; - rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; + rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); /* add up live thread stats at the group level */ if (whole) { diff --git a/fs/select.c b/fs/select.c index fd38ce2e32e3..73715e90030f 100644 --- a/fs/select.c +++ b/fs/select.c @@ -821,7 +821,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct poll_list *walk = head; unsigned long todo = nfds; - if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur) + if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; len = min_t(unsigned int, nfds, N_STACK_PPS); -- cgit v1.2.3 From 5ef097dd7ba4eab8b4f0026d85fcef9fe23b821f Mon Sep 17 00:00:00 2001 From: Michael Neuling <mikey@neuling.org> Date: Fri, 5 Mar 2010 13:42:57 -0800 Subject: exec: create initial stack independent of PAGE_SIZE Currently we create the initial stack based on the PAGE_SIZE. This is unnecessary. This creates this initial stack independent of the PAGE_SIZE. It also bumps up the number of 4k pages allocated from 20 to 32, to align with 64K page systems. Signed-off-by: Michael Neuling <mikey@neuling.org> Cc: Helge Deller <deller@gmx.de> Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Americo Wang <xiyou.wangcong@gmail.com> Cc: Anton Blanchard <anton@samba.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/exec.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 6348d79401de..da2b31dc4e1c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -556,8 +556,6 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) return 0; } -#define EXTRA_STACK_VM_PAGES 20 /* random */ - /* * Finalizes the stack vm_area_struct. The flags and permissions are updated, * the stack is optionally relocated, and some extra space is added. @@ -632,7 +630,7 @@ int setup_arg_pages(struct linux_binprm *bprm, goto out_unlock; } - stack_expand = EXTRA_STACK_VM_PAGES * PAGE_SIZE; + stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; /* * Align this down to a page boundary as expand_stack -- cgit v1.2.3 From a069c266ae5fdfbf5b4aecf2c672413aa33b2504 Mon Sep 17 00:00:00 2001 From: Don Mullis <don.mullis@gmail.com> Date: Fri, 5 Mar 2010 13:43:16 -0800 Subject: lib: build list_sort() only if needed Build list_sort() only for configs that need it -- those that don't save ~581 bytes (i386). Signed-off-by: Don Mullis <don.mullis@gmail.com> Cc: Dave Airlie <airlied@redhat.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Dave Chinner <david@fromorbit.com> Cc: Artem Bityutskiy <dedekind@infradead.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- drivers/gpu/drm/Kconfig | 1 + fs/ubifs/Kconfig | 1 + lib/Kconfig | 3 +++ lib/Makefile | 3 ++- 4 files changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 305c59003963..3d2ab03f1296 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -9,6 +9,7 @@ menuconfig DRM depends on (AGP || AGP=n) && PCI && !EMULATED_CMPXCHG && MMU select I2C select I2C_ALGOBIT + select LIST_SORT help Kernel-level support for the Direct Rendering Infrastructure (DRI) introduced in XFree86 4.0. If you say Y here, you need to select diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 830e3f76f442..430c69f39842 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -7,6 +7,7 @@ config UBIFS_FS select CRYPTO if UBIFS_FS_ZLIB select CRYPTO_LZO if UBIFS_FS_LZO select CRYPTO_DEFLATE if UBIFS_FS_ZLIB + select LIST_SORT depends on MTD_UBI help UBIFS is a file system for flash devices which works on top of UBI. diff --git a/lib/Kconfig b/lib/Kconfig index 97b136ff117e..8034c46327cb 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -160,6 +160,9 @@ config TEXTSEARCH_BM config TEXTSEARCH_FSM tristate +config LIST_SORT + boolean + config HAS_IOMEM boolean depends on !NO_IOMEM diff --git a/lib/Makefile b/lib/Makefile index 3b0b4a696db9..e39c361b0be3 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -21,7 +21,7 @@ lib-y += kobject.o kref.o klist.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ - string_helpers.o gcd.o list_sort.o + string_helpers.o gcd.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG @@ -40,6 +40,7 @@ lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o obj-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o +obj-$(CONFIG_LIST_SORT) += list_sort.o obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o obj-$(CONFIG_DEBUG_LIST) += list_debug.o -- cgit v1.2.3 From 45bf5cd7be624712ef1591e9de71f0ff7ad21cf1 Mon Sep 17 00:00:00 2001 From: Andrew Morton <akpm@linux-foundation.org> Date: Fri, 5 Mar 2010 13:43:19 -0800 Subject: fs/compat_ioctl.c: suppress two warnings fs/compat_ioctl.c: In function 'do_ioctl_trans': fs/compat_ioctl.c:534: warning: 'karg' may be used uninitialized in this function fs/compat_ioctl.c:533: warning: 'kcmd' may be used uninitialized in this function fs/compat_ioctl.c:656: warning: 'ret' may be used uninitialized in this function Reduces text size by 44 bytes. If someone calls one of these functions with an unexpected argument, the code's buggy as-is. Amerigo Wang <amwang@redhat.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Acked-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/compat_ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 0ca9ec4a79c3..6d55b61bfa79 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -545,7 +545,7 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) kcmd = MTIOCPOS; karg = &pos; break; - case MTIOCGET32: + default: /* MTIOCGET32 */ kcmd = MTIOCGET; karg = &get; break; @@ -663,7 +663,7 @@ static int raw_ioctl(unsigned fd, unsigned cmd, switch (cmd) { case RAW_SETBIND: - case RAW_GETBIND: { + default: { /* RAW_GETBIND */ struct raw_config_request req; mm_segment_t oldfs = get_fs(); -- cgit v1.2.3 From e17a5765f20d1219c3f05eb17aab11671978e0ec Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan <adobriyan@gmail.com> Date: Fri, 5 Mar 2010 13:43:59 -0800 Subject: proc: do translation + unlink atomically at remove_proc_entry() remove_proc_entry() does lock lookup parent unlock lock unlink proc entry from lists unlock which can be made bit more correct by doing parent translation + unlink without dropping lock. Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/proc/generic.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 9580abeadeb3..ce2d95477701 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -291,19 +291,17 @@ static const struct inode_operations proc_file_inode_operations = { * returns the struct proc_dir_entry for "/proc/tty/driver", and * returns "serial" in residual. */ -static int xlate_proc_name(const char *name, - struct proc_dir_entry **ret, const char **residual) +static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, + const char **residual) { const char *cp = name, *next; struct proc_dir_entry *de; int len; - int rtn = 0; de = *ret; if (!de) de = &proc_root; - spin_lock(&proc_subdir_lock); while (1) { next = strchr(cp, '/'); if (!next) @@ -314,17 +312,24 @@ static int xlate_proc_name(const char *name, if (proc_match(len, cp, de)) break; } - if (!de) { - rtn = -ENOENT; - goto out; - } + if (!de) + return -ENOENT; cp += len + 1; } *residual = cp; *ret = de; -out: + return 0; +} + +static int xlate_proc_name(const char *name, struct proc_dir_entry **ret, + const char **residual) +{ + int rv; + + spin_lock(&proc_subdir_lock); + rv = __xlate_proc_name(name, ret, residual); spin_unlock(&proc_subdir_lock); - return rtn; + return rv; } static DEFINE_IDA(proc_inum_ida); @@ -797,11 +802,13 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) const char *fn = name; int len; - if (xlate_proc_name(name, &parent, &fn) != 0) + spin_lock(&proc_subdir_lock); + if (__xlate_proc_name(name, &parent, &fn) != 0) { + spin_unlock(&proc_subdir_lock); return; + } len = strlen(fn); - spin_lock(&proc_subdir_lock); for (p = &parent->subdir; *p; p=&(*p)->next ) { if (proc_match(len, fn, *p)) { de = *p; -- cgit v1.2.3 From 12bac0d9f4dbf3445a0319beee848d15fa32775e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan <adobriyan@gmail.com> Date: Fri, 5 Mar 2010 13:44:00 -0800 Subject: proc: warn on non-existing proc entries * warn if creation goes on to non-existent directory * warn if removal goes on from non-existing directory * warn if non-existing proc entry is removed Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/proc/generic.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index ce2d95477701..08f4d71dacd7 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -312,8 +312,10 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, if (proc_match(len, cp, de)) break; } - if (!de) + if (!de) { + WARN(1, "name '%s'\n", name); return -ENOENT; + } cp += len + 1; } *residual = cp; @@ -818,8 +820,10 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) } } spin_unlock(&proc_subdir_lock); - if (!de) + if (!de) { + WARN(1, "name '%s'\n", name); return; + } spin_lock(&de->pde_unload_lock); /* -- cgit v1.2.3 From 05f47fda9fc5b17bfab189e9d54228025befc996 Mon Sep 17 00:00:00 2001 From: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com> Date: Fri, 5 Mar 2010 13:44:05 -0800 Subject: coredump: unify dump_seek() implementations for each binfmt_*.c The current ELF dumper can produce broken corefiles if program headers exceed 65535. In particular, the program in 64-bit environment often demands more than 65535 mmaps. If you google max_map_count, then you can find many users facing this problem. Solaris has already dealt with this issue, and other OSes have also adopted the same method as in Solaris. Currently, Sun's document and AMD 64 ABI include the description for the extension, where they call the extension Extended Numbering. See Reference for further information. I believe that linux kernel should adopt the same way as they did, so I've written this patch. I am also preparing for patches of GDB and binutils. How to fix ========== In new dumping process, there are two cases according to weather or not the number of program headers is equal to or more than 65535. - if less than 65535, the produced corefile format is exactly the same as the ordinary one. - if equal to or more than 65535, then e_phnum field is set to newly introduced constant PN_XNUM(0xffff) and the actual number of program headers is set to sh_info field of the section header at index 0. Compatibility Concern ===================== * As already mentioned in Summary, Sun and AMD64 has already adopted this. See Reference. * There are four combinations according to whether kernel and userland tools are respectively modified or not. The next table summarizes shortly for each combination. --------------------------------------------- Original Kernel | Modified Kernel --------------------------------------------- < 65535 | >= 65535 | < 65535 | >= 65535 ------------------------------------------------------------- Original Tools | OK | broken | OK | broken (#) ------------------------------------------------------------- Modified Tools | OK | broken | OK | OK ------------------------------------------------------------- Note that there is no case that `OK' changes to `broken'. (#) Although this case remains broken, O-M behaves better than O-O. That is, while in O-O case e_phnum field would be extremely small due to integer overflow, in O-M case it is guaranteed to be at least 65535 by being set to PN_XNUM(0xFFFF), much closer to the actual correct value than the O-O case. Test Program ============ Here is a test program mkmmaps.c that is useful to produce the corefile with many mmaps. To use this, please take the following steps: $ ulimit -c unlimited $ sysctl vm.max_map_count=70000 # default 65530 is too small $ sysctl fs.file-max=70000 $ mkmmaps 65535 Then, the program will abort and a corefile will be generated. If failed, there are two cases according to the error message displayed. * ``out of memory'' means vm.max_map_count is still smaller * ``too many open files'' means fs.file-max is still smaller So, please change it to a larger value, and then retry it. mkmmaps.c == #include <stdio.h> #include <stdlib.h> #include <sys/mman.h> #include <fcntl.h> #include <unistd.h> int main(int argc, char **argv) { int maps_num; if (argc < 2) { fprintf(stderr, "mkmmaps [number of maps to be created]\n"); exit(1); } if (sscanf(argv[1], "%d", &maps_num) == EOF) { perror("sscanf"); exit(2); } if (maps_num < 0) { fprintf(stderr, "%d is invalid\n", maps_num); exit(3); } for (; maps_num > 0; --maps_num) { if (MAP_FAILED == mmap((void *)NULL, (size_t) 1, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, (int) -1, (off_t) NULL)) { perror("mmap"); exit(4); } } abort(); { char buffer[128]; sprintf(buffer, "wc -l /proc/%u/maps", getpid()); system(buffer); } return 0; } Tested on i386, ia64 and um/sys-i386. Built on sh4 (which covers fs/binfmt_elf_fdpic.c) References ========== - Sun microsystems: Linker and Libraries. Part No: 817-1984-17, September 2008. URL: http://docs.sun.com/app/docs/doc/817-1984 - System V ABI AMD64 Architecture Processor Supplement Draft Version 0.99., May 11, 2009. URL: http://www.x86-64.org/ This patch: There are three different definitions for dump_seek() functions in binfmt_aout.c, binfmt_elf.c and binfmt_elf_fdpic.c, respectively. The only for binfmt_elf.c. My next patch will move dump_seek() into a header file in order to share the same implementations for dump_write() and dump_seek(). As the first step, this patch unify these three definitions for dump_seek() by applying the past commits that have been applied only for binfmt_elf.c. Specifically, the modification made here is part of the following commits: * d025c9db7f31fc0554ce7fb2dfc78d35a77f3487 * 7f14daa19ea36b200d237ad3ac5826ae25360461 This patch does not change a shape of corefiles. Signed-off-by: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: Jeff Dike <jdike@addtoit.com> Cc: David Howells <dhowells@redhat.com> Cc: Greg Ungerer <gerg@snapgear.com> Cc: Roland McGrath <roland@redhat.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Andi Kleen <andi@firstfloor.org> Cc: Alan Cox <alan@lxorguk.ukuu.org.uk> Cc: <linux-arch@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/binfmt_aout.c | 31 +++++++++++++++++++++------ fs/binfmt_elf_fdpic.c | 59 +++++++++++++++++++++++++++++++++------------------ 2 files changed, 62 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 61dd00a6c7b4..d2f8872dd767 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -69,16 +69,32 @@ static int dump_write(struct file *file, const void *addr, int nr) return file->f_op->write(file, addr, nr, &file->f_pos) == nr; } +static int dump_seek(struct file *file, loff_t off) +{ + if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + if (file->f_op->llseek(file, off, SEEK_CUR) < 0) + return 0; + } else { + char *buf = (char *)get_zeroed_page(GFP_KERNEL); + if (!buf) + return 0; + while (off > 0) { + unsigned long n = off; + if (n > PAGE_SIZE) + n = PAGE_SIZE; + if (!dump_write(file, buf, n)) + return 0; + off -= n; + } + free_page((unsigned long)buf); + } + return 1; +} + #define DUMP_WRITE(addr, nr) \ if (!dump_write(file, (void *)(addr), (nr))) \ goto end_coredump; -#define DUMP_SEEK(offset) \ -if (file->f_op->llseek) { \ - if (file->f_op->llseek(file,(offset),0) != (offset)) \ - goto end_coredump; \ -} else file->f_pos = (offset) - /* * Routine writes a core dump image in the current directory. * Currently only a stub-function. @@ -132,7 +148,8 @@ static int aout_core_dump(struct coredump_params *cprm) /* struct user */ DUMP_WRITE(&dump,sizeof(dump)); /* Now dump all of the user data. Include malloced stuff as well */ - DUMP_SEEK(PAGE_SIZE); + if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(dump))) + goto end_coredump; /* now we start writing out the user space info */ set_fs(USER_DS); /* Dump the data area */ diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 18d77297ccc8..32d9b44c3cb9 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1226,11 +1226,22 @@ static int dump_write(struct file *file, const void *addr, int nr) static int dump_seek(struct file *file, loff_t off) { - if (file->f_op->llseek) { - if (file->f_op->llseek(file, off, SEEK_SET) != off) + if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + if (file->f_op->llseek(file, off, SEEK_CUR) < 0) return 0; } else { - file->f_pos = off; + char *buf = (char *)get_zeroed_page(GFP_KERNEL); + if (!buf) + return 0; + while (off > 0) { + unsigned long n = off; + if (n > PAGE_SIZE) + n = PAGE_SIZE; + if (!dump_write(file, buf, n)) + return 0; + off -= n; + } + free_page((unsigned long)buf); } return 1; } @@ -1313,30 +1324,35 @@ static int notesize(struct memelfnote *en) /* #define DEBUG */ -#define DUMP_WRITE(addr, nr) \ - do { if (!dump_write(file, (addr), (nr))) return 0; } while(0) -#define DUMP_SEEK(off) \ - do { if (!dump_seek(file, (off))) return 0; } while(0) +#define DUMP_WRITE(addr, nr, foffset) \ + do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0) -static int writenote(struct memelfnote *men, struct file *file) +static int alignfile(struct file *file, loff_t *foffset) { - struct elf_note en; + static const char buf[4] = { 0, }; + DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset); + return 1; +} +static int writenote(struct memelfnote *men, struct file *file, + loff_t *foffset) +{ + struct elf_note en; en.n_namesz = strlen(men->name) + 1; en.n_descsz = men->datasz; en.n_type = men->type; - DUMP_WRITE(&en, sizeof(en)); - DUMP_WRITE(men->name, en.n_namesz); - /* XXX - cast from long long to long to avoid need for libgcc.a */ - DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */ - DUMP_WRITE(men->data, men->datasz); - DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */ + DUMP_WRITE(&en, sizeof(en), foffset); + DUMP_WRITE(men->name, en.n_namesz, foffset); + if (!alignfile(file, foffset)) + return 0; + DUMP_WRITE(men->data, men->datasz, foffset); + if (!alignfile(file, foffset)) + return 0; return 1; } #undef DUMP_WRITE -#undef DUMP_SEEK #define DUMP_WRITE(addr, nr) \ if ((size += (nr)) > cprm->limit || \ @@ -1552,7 +1568,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size, err = -EIO; kunmap(page); page_cache_release(page); - } else if (!dump_seek(file, file->f_pos + PAGE_SIZE)) + } else if (!dump_seek(file, PAGE_SIZE)) err = -EFBIG; if (err) goto out; @@ -1605,7 +1621,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) int i; struct vm_area_struct *vma; struct elfhdr *elf = NULL; - loff_t offset = 0, dataoff; + loff_t offset = 0, dataoff, foffset; int numnote; struct memelfnote *notes = NULL; struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ @@ -1730,6 +1746,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) DUMP_WRITE(elf, sizeof(*elf)); offset += sizeof(*elf); /* Elf header */ offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */ + foffset = offset; /* Write notes phdr entry */ { @@ -1786,7 +1803,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) /* write out the notes section */ for (i = 0; i < numnote; i++) - if (!writenote(notes + i, cprm->file)) + if (!writenote(notes + i, cprm->file, &foffset)) goto end_coredump; /* write out the thread status notes section */ @@ -1795,11 +1812,11 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) list_entry(t, struct elf_thread_status, list); for (i = 0; i < tmp->num_notes; i++) - if (!writenote(&tmp->notes[i], cprm->file)) + if (!writenote(&tmp->notes[i], cprm->file, &foffset)) goto end_coredump; } - if (!dump_seek(cprm->file, dataoff)) + if (!dump_seek(cprm->file, dataoff - foffset)) goto end_coredump; if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit, -- cgit v1.2.3 From 088e7af73a962fcc8883b7a6392544d8342553d6 Mon Sep 17 00:00:00 2001 From: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com> Date: Fri, 5 Mar 2010 13:44:06 -0800 Subject: coredump: move dump_write() and dump_seek() into a header file My next patch will replace ELF_CORE_EXTRA_* macros by functions, putting them into other newly created *.c files. Then, each files will contain dump_write(), where each pair of binfmt_*.c and elfcore.c should be the same. So, this patch moves them into a header file with dump_seek(). Also, the patch deletes confusing DUMP_WRITE macros in each files. Signed-off-by: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: Jeff Dike <jdike@addtoit.com> Cc: David Howells <dhowells@redhat.com> Cc: Greg Ungerer <gerg@snapgear.com> Cc: Roland McGrath <roland@redhat.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Andi Kleen <andi@firstfloor.org> Cc: Alan Cox <alan@lxorguk.ukuu.org.uk> Cc: <linux-arch@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/binfmt_aout.c | 49 ++++++++----------------------------------- fs/binfmt_elf.c | 52 +++++++++++++--------------------------------- fs/binfmt_elf_fdpic.c | 54 ++++++++++++++---------------------------------- include/linux/coredump.h | 41 ++++++++++++++++++++++++++++++++++++ 4 files changed, 79 insertions(+), 117 deletions(-) create mode 100644 include/linux/coredump.h (limited to 'fs') diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index d2f8872dd767..15d80bb35d6f 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -24,6 +24,7 @@ #include <linux/binfmts.h> #include <linux/personality.h> #include <linux/init.h> +#include <linux/coredump.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -59,42 +60,6 @@ static int set_brk(unsigned long start, unsigned long end) return 0; } -/* - * These are the only things you should do on a core-file: use only these - * macros to write out all the necessary info. - */ - -static int dump_write(struct file *file, const void *addr, int nr) -{ - return file->f_op->write(file, addr, nr, &file->f_pos) == nr; -} - -static int dump_seek(struct file *file, loff_t off) -{ - if (file->f_op->llseek && file->f_op->llseek != no_llseek) { - if (file->f_op->llseek(file, off, SEEK_CUR) < 0) - return 0; - } else { - char *buf = (char *)get_zeroed_page(GFP_KERNEL); - if (!buf) - return 0; - while (off > 0) { - unsigned long n = off; - if (n > PAGE_SIZE) - n = PAGE_SIZE; - if (!dump_write(file, buf, n)) - return 0; - off -= n; - } - free_page((unsigned long)buf); - } - return 1; -} - -#define DUMP_WRITE(addr, nr) \ - if (!dump_write(file, (void *)(addr), (nr))) \ - goto end_coredump; - /* * Routine writes a core dump image in the current directory. * Currently only a stub-function. @@ -146,7 +111,8 @@ static int aout_core_dump(struct coredump_params *cprm) set_fs(KERNEL_DS); /* struct user */ - DUMP_WRITE(&dump,sizeof(dump)); + if (!dump_write(file, &dump, sizeof(dump))) + goto end_coredump; /* Now dump all of the user data. Include malloced stuff as well */ if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(dump))) goto end_coredump; @@ -156,17 +122,20 @@ static int aout_core_dump(struct coredump_params *cprm) if (dump.u_dsize != 0) { dump_start = START_DATA(dump); dump_size = dump.u_dsize << PAGE_SHIFT; - DUMP_WRITE(dump_start,dump_size); + if (!dump_write(file, dump_start, dump_size)) + goto end_coredump; } /* Now prepare to dump the stack area */ if (dump.u_ssize != 0) { dump_start = START_STACK(dump); dump_size = dump.u_ssize << PAGE_SHIFT; - DUMP_WRITE(dump_start,dump_size); + if (!dump_write(file, dump_start, dump_size)) + goto end_coredump; } /* Finally dump the task struct. Not be used by gdb, but could be useful */ set_fs(KERNEL_DS); - DUMP_WRITE(current,sizeof(*current)); + if (!dump_write(file, current, sizeof(*current))) + goto end_coredump; end_coredump: set_fs(fs); return has_dumped; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index fd5b2ea5d299..0bcfbb05c32d 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -31,6 +31,7 @@ #include <linux/random.h> #include <linux/elf.h> #include <linux/utsname.h> +#include <linux/coredump.h> #include <asm/uaccess.h> #include <asm/param.h> #include <asm/page.h> @@ -1085,36 +1086,6 @@ out: * Modelled on fs/exec.c:aout_core_dump() * Jeremy Fitzhardinge <jeremy@sw.oz.au> */ -/* - * These are the only things you should do on a core-file: use only these - * functions to write out all the necessary info. - */ -static int dump_write(struct file *file, const void *addr, int nr) -{ - return file->f_op->write(file, addr, nr, &file->f_pos) == nr; -} - -static int dump_seek(struct file *file, loff_t off) -{ - if (file->f_op->llseek && file->f_op->llseek != no_llseek) { - if (file->f_op->llseek(file, off, SEEK_CUR) < 0) - return 0; - } else { - char *buf = (char *)get_zeroed_page(GFP_KERNEL); - if (!buf) - return 0; - while (off > 0) { - unsigned long n = off; - if (n > PAGE_SIZE) - n = PAGE_SIZE; - if (!dump_write(file, buf, n)) - return 0; - off -= n; - } - free_page((unsigned long)buf); - } - return 1; -} /* * Decide what to dump of a segment, part, all or none. @@ -1249,11 +1220,6 @@ static int writenote(struct memelfnote *men, struct file *file, } #undef DUMP_WRITE -#define DUMP_WRITE(addr, nr) \ - if ((size += (nr)) > cprm->limit || \ - !dump_write(cprm->file, (addr), (nr))) \ - goto end_coredump; - static void fill_elf_header(struct elfhdr *elf, int segs, u16 machine, u32 flags, u8 osabi) { @@ -1934,7 +1900,10 @@ static int elf_core_dump(struct coredump_params *cprm) fs = get_fs(); set_fs(KERNEL_DS); - DUMP_WRITE(elf, sizeof(*elf)); + size += sizeof(*elf); + if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf))) + goto end_coredump; + offset += sizeof(*elf); /* Elf header */ offset += (segs + 1) * sizeof(struct elf_phdr); /* Program headers */ foffset = offset; @@ -1948,7 +1917,11 @@ static int elf_core_dump(struct coredump_params *cprm) fill_elf_note_phdr(&phdr, sz, offset); offset += sz; - DUMP_WRITE(&phdr, sizeof(phdr)); + + size += sizeof(phdr); + if (size > cprm->limit + || !dump_write(cprm->file, &phdr, sizeof(phdr))) + goto end_coredump; } dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); @@ -1979,7 +1952,10 @@ static int elf_core_dump(struct coredump_params *cprm) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; - DUMP_WRITE(&phdr, sizeof(phdr)); + size += sizeof(phdr); + if (size > cprm->limit + || !dump_write(cprm->file, &phdr, sizeof(phdr))) + goto end_coredump; } #ifdef ELF_CORE_WRITE_EXTRA_PHDRS diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 32d9b44c3cb9..63edf40b569b 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -34,6 +34,7 @@ #include <linux/elf.h> #include <linux/elf-fdpic.h> #include <linux/elfcore.h> +#include <linux/coredump.h> #include <asm/uaccess.h> #include <asm/param.h> @@ -1215,37 +1216,6 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, */ #ifdef CONFIG_ELF_CORE -/* - * These are the only things you should do on a core-file: use only these - * functions to write out all the necessary info. - */ -static int dump_write(struct file *file, const void *addr, int nr) -{ - return file->f_op->write(file, addr, nr, &file->f_pos) == nr; -} - -static int dump_seek(struct file *file, loff_t off) -{ - if (file->f_op->llseek && file->f_op->llseek != no_llseek) { - if (file->f_op->llseek(file, off, SEEK_CUR) < 0) - return 0; - } else { - char *buf = (char *)get_zeroed_page(GFP_KERNEL); - if (!buf) - return 0; - while (off > 0) { - unsigned long n = off; - if (n > PAGE_SIZE) - n = PAGE_SIZE; - if (!dump_write(file, buf, n)) - return 0; - off -= n; - } - free_page((unsigned long)buf); - } - return 1; -} - /* * Decide whether a segment is worth dumping; default is yes to be * sure (missing info is worse than too much; etc). @@ -1354,11 +1324,6 @@ static int writenote(struct memelfnote *men, struct file *file, } #undef DUMP_WRITE -#define DUMP_WRITE(addr, nr) \ - if ((size += (nr)) > cprm->limit || \ - !dump_write(cprm->file, (addr), (nr))) \ - goto end_coredump; - static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) { memcpy(elf->e_ident, ELFMAG, SELFMAG); @@ -1743,7 +1708,11 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) fs = get_fs(); set_fs(KERNEL_DS); - DUMP_WRITE(elf, sizeof(*elf)); + size += sizeof(*elf); + if (size > cprm->limit + || !dump_write(cprm->file, elf, sizeof(*elf))) + goto end_coredump; + offset += sizeof(*elf); /* Elf header */ offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */ foffset = offset; @@ -1760,7 +1729,11 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) fill_elf_note_phdr(&phdr, sz, offset); offset += sz; - DUMP_WRITE(&phdr, sizeof(phdr)); + + size += sizeof(phdr); + if (size > cprm->limit + || !dump_write(cprm->file, &phdr, sizeof(phdr))) + goto end_coredump; } /* Page-align dumped data */ @@ -1794,7 +1767,10 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; - DUMP_WRITE(&phdr, sizeof(phdr)); + size += sizeof(phdr); + if (size > cprm->limit + || !dump_write(cprm->file, &phdr, sizeof(phdr))) + goto end_coredump; } #ifdef ELF_CORE_WRITE_EXTRA_PHDRS diff --git a/include/linux/coredump.h b/include/linux/coredump.h new file mode 100644 index 000000000000..b3c91d7cede4 --- /dev/null +++ b/include/linux/coredump.h @@ -0,0 +1,41 @@ +#ifndef _LINUX_COREDUMP_H +#define _LINUX_COREDUMP_H + +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/fs.h> + +/* + * These are the only things you should do on a core-file: use only these + * functions to write out all the necessary info. + */ +static inline int dump_write(struct file *file, const void *addr, int nr) +{ + return file->f_op->write(file, addr, nr, &file->f_pos) == nr; +} + +static inline int dump_seek(struct file *file, loff_t off) +{ + if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + if (file->f_op->llseek(file, off, SEEK_CUR) < 0) + return 0; + } else { + char *buf = (char *)get_zeroed_page(GFP_KERNEL); + + if (!buf) + return 0; + while (off > 0) { + unsigned long n = off; + + if (n > PAGE_SIZE) + n = PAGE_SIZE; + if (!dump_write(file, buf, n)) + return 0; + off -= n; + } + free_page((unsigned long)buf); + } + return 1; +} + +#endif /* _LINUX_COREDUMP_H */ -- cgit v1.2.3 From 1fcccbac89f5bbc5e41aa72086960059fce372da Mon Sep 17 00:00:00 2001 From: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com> Date: Fri, 5 Mar 2010 13:44:07 -0800 Subject: elf coredump: replace ELF_CORE_EXTRA_* macros by functions elf_core_dump() and elf_fdpic_core_dump() use #ifdef and the corresponding macro for hiding _multiline_ logics in functions. This patch removes #ifdef and replaces ELF_CORE_EXTRA_* by corresponding functions. For architectures not implemeonting ELF_CORE_EXTRA_*, we use weak functions in order to reduce a range of modification. This cleanup is for my next patches, but I think this cleanup itself is worth doing regardless of my firnal purpose. Signed-off-by: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: Jeff Dike <jdike@addtoit.com> Cc: David Howells <dhowells@redhat.com> Cc: Greg Ungerer <gerg@snapgear.com> Cc: Roland McGrath <roland@redhat.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Andi Kleen <andi@firstfloor.org> Cc: Alan Cox <alan@lxorguk.ukuu.org.uk> Cc: <linux-arch@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/ia64/include/asm/elf.h | 48 -------------------------------- arch/ia64/kernel/Makefile | 2 ++ arch/ia64/kernel/elfcore.c | 64 +++++++++++++++++++++++++++++++++++++++++++ arch/um/sys-i386/Makefile | 2 ++ arch/um/sys-i386/asm/elf.h | 43 ----------------------------- arch/um/sys-i386/elfcore.c | 67 +++++++++++++++++++++++++++++++++++++++++++++ fs/binfmt_elf.c | 14 ++++------ fs/binfmt_elf_fdpic.c | 14 ++++------ fs/compat_binfmt_elf.c | 2 ++ include/linux/elf.h | 2 ++ include/linux/elfcore.h | 16 +++++++++++ kernel/Makefile | 3 ++ kernel/elfcore.c | 23 ++++++++++++++++ 13 files changed, 191 insertions(+), 109 deletions(-) create mode 100644 arch/ia64/kernel/elfcore.c create mode 100644 arch/um/sys-i386/elfcore.c create mode 100644 kernel/elfcore.c (limited to 'fs') diff --git a/arch/ia64/include/asm/elf.h b/arch/ia64/include/asm/elf.h index 4c41656ede87..b5298eb09adb 100644 --- a/arch/ia64/include/asm/elf.h +++ b/arch/ia64/include/asm/elf.h @@ -219,54 +219,6 @@ do { \ NEW_AUX_ENT(AT_SYSINFO_EHDR, (unsigned long) GATE_EHDR); \ } while (0) - -/* - * These macros parameterize elf_core_dump in fs/binfmt_elf.c to write out - * extra segments containing the gate DSO contents. Dumping its - * contents makes post-mortem fully interpretable later without matching up - * the same kernel and hardware config to see what PC values meant. - * Dumping its extra ELF program headers includes all the other information - * a debugger needs to easily find how the gate DSO was being used. - */ -#define ELF_CORE_EXTRA_PHDRS (GATE_EHDR->e_phnum) -#define ELF_CORE_WRITE_EXTRA_PHDRS \ -do { \ - const struct elf_phdr *const gate_phdrs = \ - (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); \ - int i; \ - Elf64_Off ofs = 0; \ - for (i = 0; i < GATE_EHDR->e_phnum; ++i) { \ - struct elf_phdr phdr = gate_phdrs[i]; \ - if (phdr.p_type == PT_LOAD) { \ - phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz); \ - phdr.p_filesz = phdr.p_memsz; \ - if (ofs == 0) { \ - ofs = phdr.p_offset = offset; \ - offset += phdr.p_filesz; \ - } \ - else \ - phdr.p_offset = ofs; \ - } \ - else \ - phdr.p_offset += ofs; \ - phdr.p_paddr = 0; /* match other core phdrs */ \ - DUMP_WRITE(&phdr, sizeof(phdr)); \ - } \ -} while (0) -#define ELF_CORE_WRITE_EXTRA_DATA \ -do { \ - const struct elf_phdr *const gate_phdrs = \ - (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); \ - int i; \ - for (i = 0; i < GATE_EHDR->e_phnum; ++i) { \ - if (gate_phdrs[i].p_type == PT_LOAD) { \ - DUMP_WRITE((void *) gate_phdrs[i].p_vaddr, \ - PAGE_ALIGN(gate_phdrs[i].p_memsz)); \ - break; \ - } \ - } \ -} while (0) - /* * format for entries in the Global Offset Table */ diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 4138282aefa8..db10b1e378b0 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -45,6 +45,8 @@ endif obj-$(CONFIG_DMAR) += pci-dma.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o +obj-$(CONFIG_BINFMT_ELF) += elfcore.o + # fp_emulate() expects f2-f5,f16-f31 to contain the user-level state. CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 diff --git a/arch/ia64/kernel/elfcore.c b/arch/ia64/kernel/elfcore.c new file mode 100644 index 000000000000..57a2298a8581 --- /dev/null +++ b/arch/ia64/kernel/elfcore.c @@ -0,0 +1,64 @@ +#include <linux/elf.h> +#include <linux/coredump.h> +#include <linux/fs.h> +#include <linux/mm.h> + +#include <asm/elf.h> + + +Elf64_Half elf_core_extra_phdrs(void) +{ + return GATE_EHDR->e_phnum; +} + +int elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, + unsigned long limit) +{ + const struct elf_phdr *const gate_phdrs = + (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + Elf64_Off ofs = 0; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i) { + struct elf_phdr phdr = gate_phdrs[i]; + + if (phdr.p_type == PT_LOAD) { + phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz); + phdr.p_filesz = phdr.p_memsz; + if (ofs == 0) { + ofs = phdr.p_offset = offset; + offset += phdr.p_filesz; + } else { + phdr.p_offset = ofs; + } + } else { + phdr.p_offset += ofs; + } + phdr.p_paddr = 0; /* match other core phdrs */ + *size += sizeof(phdr); + if (*size > limit || !dump_write(file, &phdr, sizeof(phdr))) + return 0; + } + return 1; +} + +int elf_core_write_extra_data(struct file *file, size_t *size, + unsigned long limit) +{ + const struct elf_phdr *const gate_phdrs = + (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i) { + if (gate_phdrs[i].p_type == PT_LOAD) { + void *addr = (void *)gate_phdrs[i].p_vaddr; + size_t memsz = PAGE_ALIGN(gate_phdrs[i].p_memsz); + + *size += memsz; + if (*size > limit || !dump_write(file, addr, memsz)) + return 0; + break; + } + } + return 1; +} diff --git a/arch/um/sys-i386/Makefile b/arch/um/sys-i386/Makefile index 1b549bca4645..804b28dd0328 100644 --- a/arch/um/sys-i386/Makefile +++ b/arch/um/sys-i386/Makefile @@ -6,6 +6,8 @@ obj-y = bug.o bugs.o checksum.o delay.o fault.o ksyms.o ldt.o ptrace.o \ ptrace_user.o setjmp.o signal.o stub.o stub_segv.o syscalls.o sysrq.o \ sys_call_table.o tls.o +obj-$(CONFIG_BINFMT_ELF) += elfcore.o + subarch-obj-y = lib/semaphore_32.o lib/string_32.o subarch-obj-$(CONFIG_HIGHMEM) += mm/highmem_32.o subarch-obj-$(CONFIG_MODULES) += kernel/module.o diff --git a/arch/um/sys-i386/asm/elf.h b/arch/um/sys-i386/asm/elf.h index 770885472ed4..e64cd41d7bab 100644 --- a/arch/um/sys-i386/asm/elf.h +++ b/arch/um/sys-i386/asm/elf.h @@ -116,47 +116,4 @@ do { \ } \ } while (0) -/* - * These macros parameterize elf_core_dump in fs/binfmt_elf.c to write out - * extra segments containing the vsyscall DSO contents. Dumping its - * contents makes post-mortem fully interpretable later without matching up - * the same kernel and hardware config to see what PC values meant. - * Dumping its extra ELF program headers includes all the other information - * a debugger needs to easily find how the vsyscall DSO was being used. - */ -#define ELF_CORE_EXTRA_PHDRS \ - (vsyscall_ehdr ? (((struct elfhdr *)vsyscall_ehdr)->e_phnum) : 0 ) - -#define ELF_CORE_WRITE_EXTRA_PHDRS \ -if ( vsyscall_ehdr ) { \ - const struct elfhdr *const ehdrp = (struct elfhdr *)vsyscall_ehdr; \ - const struct elf_phdr *const phdrp = \ - (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); \ - int i; \ - Elf32_Off ofs = 0; \ - for (i = 0; i < ehdrp->e_phnum; ++i) { \ - struct elf_phdr phdr = phdrp[i]; \ - if (phdr.p_type == PT_LOAD) { \ - ofs = phdr.p_offset = offset; \ - offset += phdr.p_filesz; \ - } \ - else \ - phdr.p_offset += ofs; \ - phdr.p_paddr = 0; /* match other core phdrs */ \ - DUMP_WRITE(&phdr, sizeof(phdr)); \ - } \ -} -#define ELF_CORE_WRITE_EXTRA_DATA \ -if ( vsyscall_ehdr ) { \ - const struct elfhdr *const ehdrp = (struct elfhdr *)vsyscall_ehdr; \ - const struct elf_phdr *const phdrp = \ - (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); \ - int i; \ - for (i = 0; i < ehdrp->e_phnum; ++i) { \ - if (phdrp[i].p_type == PT_LOAD) \ - DUMP_WRITE((void *) phdrp[i].p_vaddr, \ - phdrp[i].p_filesz); \ - } \ -} - #endif diff --git a/arch/um/sys-i386/elfcore.c b/arch/um/sys-i386/elfcore.c new file mode 100644 index 000000000000..30cac52a04b4 --- /dev/null +++ b/arch/um/sys-i386/elfcore.c @@ -0,0 +1,67 @@ +#include <linux/elf.h> +#include <linux/coredump.h> +#include <linux/fs.h> +#include <linux/mm.h> + +#include <asm/elf.h> + + +Elf32_Half elf_core_extra_phdrs(void) +{ + return vsyscall_ehdr ? (((struct elfhdr *)vsyscall_ehdr)->e_phnum) : 0; +} + +int elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, + unsigned long limit) +{ + if ( vsyscall_ehdr ) { + const struct elfhdr *const ehdrp = + (struct elfhdr *) vsyscall_ehdr; + const struct elf_phdr *const phdrp = + (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); + int i; + Elf32_Off ofs = 0; + + for (i = 0; i < ehdrp->e_phnum; ++i) { + struct elf_phdr phdr = phdrp[i]; + + if (phdr.p_type == PT_LOAD) { + ofs = phdr.p_offset = offset; + offset += phdr.p_filesz; + } else { + phdr.p_offset += ofs; + } + phdr.p_paddr = 0; /* match other core phdrs */ + *size += sizeof(phdr); + if (*size > limit + || !dump_write(file, &phdr, sizeof(phdr))) + return 0; + } + } + return 1; +} + +int elf_core_write_extra_data(struct file *file, size_t *size, + unsigned long limit) +{ + if ( vsyscall_ehdr ) { + const struct elfhdr *const ehdrp = + (struct elfhdr *) vsyscall_ehdr; + const struct elf_phdr *const phdrp = + (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); + int i; + + for (i = 0; i < ehdrp->e_phnum; ++i) { + if (phdrp[i].p_type == PT_LOAD) { + void *addr = (void *) phdrp[i].p_vaddr; + size_t filesz = phdrp[i].p_filesz; + + *size += filesz; + if (*size > limit + || !dump_write(file, addr, filesz)) + return 0; + } + } + } + return 1; +} diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 0bcfbb05c32d..c1a499599b7d 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1878,9 +1878,7 @@ static int elf_core_dump(struct coredump_params *cprm) * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. */ segs = current->mm->map_count; -#ifdef ELF_CORE_EXTRA_PHDRS - segs += ELF_CORE_EXTRA_PHDRS; -#endif + segs += elf_core_extra_phdrs(); gate_vma = get_gate_vma(current); if (gate_vma != NULL) @@ -1958,9 +1956,8 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; } -#ifdef ELF_CORE_WRITE_EXTRA_PHDRS - ELF_CORE_WRITE_EXTRA_PHDRS; -#endif + if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit)) + goto end_coredump; /* write out the notes section */ if (!write_note_info(&info, cprm->file, &foffset)) @@ -1999,9 +1996,8 @@ static int elf_core_dump(struct coredump_params *cprm) } } -#ifdef ELF_CORE_WRITE_EXTRA_DATA - ELF_CORE_WRITE_EXTRA_DATA; -#endif + if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit)) + goto end_coredump; end_coredump: set_fs(fs); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 63edf40b569b..952699a86ec3 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1664,9 +1664,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) elf_core_copy_regs(&prstatus->pr_reg, cprm->regs); segs = current->mm->map_count; -#ifdef ELF_CORE_EXTRA_PHDRS - segs += ELF_CORE_EXTRA_PHDRS; -#endif + segs += elf_core_extra_phdrs(); /* Set up header */ fill_elf_fdpic_header(elf, segs + 1); /* including notes section */ @@ -1773,9 +1771,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; } -#ifdef ELF_CORE_WRITE_EXTRA_PHDRS - ELF_CORE_WRITE_EXTRA_PHDRS; -#endif + if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit)) + goto end_coredump; /* write out the notes section */ for (i = 0; i < numnote; i++) @@ -1799,9 +1796,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) mm_flags) < 0) goto end_coredump; -#ifdef ELF_CORE_WRITE_EXTRA_DATA - ELF_CORE_WRITE_EXTRA_DATA; -#endif + if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit)) + goto end_coredump; if (cprm->file->f_pos != offset) { /* Sanity check */ diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 0adced2f296f..112e45a17e99 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -28,10 +28,12 @@ #undef elfhdr #undef elf_phdr +#undef elf_shdr #undef elf_note #undef elf_addr_t #define elfhdr elf32_hdr #define elf_phdr elf32_phdr +#define elf_shdr elf32_shdr #define elf_note elf32_note #define elf_addr_t Elf32_Addr diff --git a/include/linux/elf.h b/include/linux/elf.h index ad990c5f63f6..ccde3fd45f36 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -396,6 +396,7 @@ extern Elf32_Dyn _DYNAMIC []; #define elf_phdr elf32_phdr #define elf_note elf32_note #define elf_addr_t Elf32_Off +#define Elf_Half Elf32_Half #else @@ -404,6 +405,7 @@ extern Elf64_Dyn _DYNAMIC []; #define elf_phdr elf64_phdr #define elf_note elf64_note #define elf_addr_t Elf64_Off +#define Elf_Half Elf64_Half #endif diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index 00d6a68d0421..cfda74f521b5 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -8,6 +8,8 @@ #include <linux/user.h> #endif #include <linux/ptrace.h> +#include <linux/elf.h> +#include <linux/fs.h> struct elf_siginfo { @@ -150,5 +152,19 @@ static inline int elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregse #endif /* __KERNEL__ */ +/* + * These functions parameterize elf_core_dump in fs/binfmt_elf.c to write out + * extra segments containing the gate DSO contents. Dumping its + * contents makes post-mortem fully interpretable later without matching up + * the same kernel and hardware config to see what PC values meant. + * Dumping its extra ELF program headers includes all the other information + * a debugger needs to easily find how the gate DSO was being used. + */ +extern Elf_Half elf_core_extra_phdrs(void); +extern int +elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, + unsigned long limit); +extern int +elf_core_write_extra_data(struct file *file, size_t *size, unsigned long limit); #endif /* _LINUX_ELFCORE_H */ diff --git a/kernel/Makefile b/kernel/Makefile index 7b974699f8c2..a987aa1676b5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -91,6 +91,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_BINFMT_ELF) += elfcore.o +obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o +obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ diff --git a/kernel/elfcore.c b/kernel/elfcore.c new file mode 100644 index 000000000000..5445741f4b4c --- /dev/null +++ b/kernel/elfcore.c @@ -0,0 +1,23 @@ +#include <linux/elf.h> +#include <linux/fs.h> +#include <linux/mm.h> + +#include <asm/elf.h> + + +Elf_Half __weak elf_core_extra_phdrs(void) +{ + return 0; +} + +int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, + unsigned long limit) +{ + return 1; +} + +int __weak elf_core_write_extra_data(struct file *file, size_t *size, + unsigned long limit) +{ + return 1; +} -- cgit v1.2.3 From 93eb211e6c9ff6054fcf9c5b9e344d8d9ad29175 Mon Sep 17 00:00:00 2001 From: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com> Date: Fri, 5 Mar 2010 13:44:09 -0800 Subject: elf coredump: make offset calculation process and writing process explicit By the next patch, elf_core_dump() and elf_fdpic_core_dump() will support extended numbering and so will produce the corefiles with section header table in a special case. The problem is the process of writing a file header offset of the section header table into e_shoff field of the ELF header. ELF header is positioned at the beginning of the corefile, while section header at the end. So, we need to take which of the following ways: 1. Seek backward to retry writing operation for ELF header after writing process for a whole part 2. Make offset calculation process and writing process totally sequential The clause 1. is not always possible: one cannot assume that file system supports seek function. Consider the no_llseek case. Therefore, this patch adopts the clause 2. Signed-off-by: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: Jeff Dike <jdike@addtoit.com> Cc: David Howells <dhowells@redhat.com> Cc: Greg Ungerer <gerg@snapgear.com> Cc: Roland McGrath <roland@redhat.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Andi Kleen <andi@firstfloor.org> Cc: Alan Cox <alan@lxorguk.ukuu.org.uk> Cc: <linux-arch@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/binfmt_elf.c | 27 ++++++++++++++++----------- fs/binfmt_elf_fdpic.c | 29 ++++++++++++++++------------- 2 files changed, 32 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index c1a499599b7d..6fc49b6ed936 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1856,6 +1856,7 @@ static int elf_core_dump(struct coredump_params *cprm) loff_t offset = 0, dataoff, foffset; unsigned long mm_flags; struct elf_note_info info; + struct elf_phdr *phdr4note = NULL; /* * We no longer stop all VM operations. @@ -1898,28 +1899,22 @@ static int elf_core_dump(struct coredump_params *cprm) fs = get_fs(); set_fs(KERNEL_DS); - size += sizeof(*elf); - if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf))) - goto end_coredump; - offset += sizeof(*elf); /* Elf header */ offset += (segs + 1) * sizeof(struct elf_phdr); /* Program headers */ foffset = offset; /* Write notes phdr entry */ { - struct elf_phdr phdr; size_t sz = get_note_info_size(&info); sz += elf_coredump_extra_notes_size(); - fill_elf_note_phdr(&phdr, sz, offset); - offset += sz; - - size += sizeof(phdr); - if (size > cprm->limit - || !dump_write(cprm->file, &phdr, sizeof(phdr))) + phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL); + if (!phdr4note) goto end_coredump; + + fill_elf_note_phdr(phdr4note, sz, offset); + offset += sz; } dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); @@ -1931,6 +1926,15 @@ static int elf_core_dump(struct coredump_params *cprm) */ mm_flags = current->mm->flags; + size += sizeof(*elf); + if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf))) + goto end_coredump; + + size += sizeof(*phdr4note); + if (size > cprm->limit + || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note))) + goto end_coredump; + /* Write program headers for segments dump */ for (vma = first_vma(current, gate_vma); vma != NULL; vma = next_vma(vma, gate_vma)) { @@ -2004,6 +2008,7 @@ end_coredump: cleanup: free_note_info(&info); + kfree(phdr4note); kfree(elf); out: return has_dumped; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 952699a86ec3..112da491d75d 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1600,6 +1600,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) int thread_status_size = 0; elf_addr_t *auxv; unsigned long mm_flags; + struct elf_phdr *phdr4note = NULL; /* * We no longer stop all VM operations. @@ -1706,18 +1707,12 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) fs = get_fs(); set_fs(KERNEL_DS); - size += sizeof(*elf); - if (size > cprm->limit - || !dump_write(cprm->file, elf, sizeof(*elf))) - goto end_coredump; - offset += sizeof(*elf); /* Elf header */ offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */ foffset = offset; /* Write notes phdr entry */ { - struct elf_phdr phdr; int sz = 0; for (i = 0; i < numnote; i++) @@ -1725,13 +1720,12 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) sz += thread_status_size; - fill_elf_note_phdr(&phdr, sz, offset); - offset += sz; - - size += sizeof(phdr); - if (size > cprm->limit - || !dump_write(cprm->file, &phdr, sizeof(phdr))) + phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL); + if (!phdr4note) goto end_coredump; + + fill_elf_note_phdr(phdr4note, sz, offset); + offset += sz; } /* Page-align dumped data */ @@ -1744,6 +1738,15 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) */ mm_flags = current->mm->flags; + size += sizeof(*elf); + if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf))) + goto end_coredump; + + size += sizeof(*phdr4note); + if (size > cprm->limit + || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note))) + goto end_coredump; + /* write program headers for segments dump */ for (vma = current->mm->mmap; vma; vma = vma->vm_next) { struct elf_phdr phdr; @@ -1815,7 +1818,7 @@ cleanup: list_del(tmp); kfree(list_entry(tmp, struct elf_thread_status, list)); } - + kfree(phdr4note); kfree(elf); kfree(prstatus); kfree(psinfo); -- cgit v1.2.3 From 8d9032bbe4671dc481261ccd4e161cd96e54b118 Mon Sep 17 00:00:00 2001 From: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com> Date: Fri, 5 Mar 2010 13:44:10 -0800 Subject: elf coredump: add extended numbering support The current ELF dumper implementation can produce broken corefiles if program headers exceed 65535. This number is determined by the number of vmas which the process have. In particular, some extreme programs may use more than 65535 vmas. (If you google max_map_count, you can find some users facing this problem.) This kind of program never be able to generate correct coredumps. This patch implements ``extended numbering'' that uses sh_info field of the first section header instead of e_phnum field in order to represent upto 4294967295 vmas. This is supported by AMD64-ABI(http://www.x86-64.org/documentation.html) and Solaris(http://docs.sun.com/app/docs/doc/817-1984/). Of course, we are preparing patches for gdb and binutils. Signed-off-by: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: Jeff Dike <jdike@addtoit.com> Cc: David Howells <dhowells@redhat.com> Cc: Greg Ungerer <gerg@snapgear.com> Cc: Roland McGrath <roland@redhat.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Andi Kleen <andi@firstfloor.org> Cc: Alan Cox <alan@lxorguk.ukuu.org.uk> Cc: <linux-arch@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/ia64/kernel/elfcore.c | 16 +++++++++++ arch/um/sys-i386/elfcore.c | 16 +++++++++++ fs/binfmt_elf.c | 66 +++++++++++++++++++++++++++++++++++++++++++--- fs/binfmt_elf_fdpic.c | 63 +++++++++++++++++++++++++++++++++++++++++-- include/linux/elf.h | 26 +++++++++++++++++- include/linux/elfcore.h | 1 + kernel/elfcore.c | 5 ++++ 7 files changed, 187 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/arch/ia64/kernel/elfcore.c b/arch/ia64/kernel/elfcore.c index 57a2298a8581..bac1639bc320 100644 --- a/arch/ia64/kernel/elfcore.c +++ b/arch/ia64/kernel/elfcore.c @@ -62,3 +62,19 @@ int elf_core_write_extra_data(struct file *file, size_t *size, } return 1; } + +size_t elf_core_extra_data_size(void) +{ + const struct elf_phdr *const gate_phdrs = + (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + size_t size = 0; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i) { + if (gate_phdrs[i].p_type == PT_LOAD) { + size += PAGE_ALIGN(gate_phdrs[i].p_memsz); + break; + } + } + return size; +} diff --git a/arch/um/sys-i386/elfcore.c b/arch/um/sys-i386/elfcore.c index 30cac52a04b4..6bb49b687c97 100644 --- a/arch/um/sys-i386/elfcore.c +++ b/arch/um/sys-i386/elfcore.c @@ -65,3 +65,19 @@ int elf_core_write_extra_data(struct file *file, size_t *size, } return 1; } + +size_t elf_core_extra_data_size(void) +{ + if ( vsyscall_ehdr ) { + const struct elfhdr *const ehdrp = + (struct elfhdr *)vsyscall_ehdr; + const struct elf_phdr *const phdrp = + (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); + int i; + + for (i = 0; i < ehdrp->e_phnum; ++i) + if (phdrp[i].p_type == PT_LOAD) + return (size_t) phdrp[i].p_filesz; + } + return 0; +} diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 6fc49b6ed936..78de530cfb02 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1838,6 +1838,34 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, return gate_vma; } +static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, + elf_addr_t e_shoff, int segs) +{ + elf->e_shoff = e_shoff; + elf->e_shentsize = sizeof(*shdr4extnum); + elf->e_shnum = 1; + elf->e_shstrndx = SHN_UNDEF; + + memset(shdr4extnum, 0, sizeof(*shdr4extnum)); + + shdr4extnum->sh_type = SHT_NULL; + shdr4extnum->sh_size = elf->e_shnum; + shdr4extnum->sh_link = elf->e_shstrndx; + shdr4extnum->sh_info = segs; +} + +static size_t elf_core_vma_data_size(struct vm_area_struct *gate_vma, + unsigned long mm_flags) +{ + struct vm_area_struct *vma; + size_t size = 0; + + for (vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma)) + size += vma_dump_size(vma, mm_flags); + return size; +} + /* * Actual dumper * @@ -1857,6 +1885,9 @@ static int elf_core_dump(struct coredump_params *cprm) unsigned long mm_flags; struct elf_note_info info; struct elf_phdr *phdr4note = NULL; + struct elf_shdr *shdr4extnum = NULL; + Elf_Half e_phnum; + elf_addr_t e_shoff; /* * We no longer stop all VM operations. @@ -1885,12 +1916,19 @@ static int elf_core_dump(struct coredump_params *cprm) if (gate_vma != NULL) segs++; + /* for notes section */ + segs++; + + /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid + * this, kernel supports extended numbering. Have a look at + * include/linux/elf.h for further information. */ + e_phnum = segs > PN_XNUM ? PN_XNUM : segs; + /* * Collect all the non-memory information about the process for the * notes. This also sets up the file header. */ - if (!fill_note_info(elf, segs + 1, /* including notes section */ - &info, cprm->signr, cprm->regs)) + if (!fill_note_info(elf, e_phnum, &info, cprm->signr, cprm->regs)) goto cleanup; has_dumped = 1; @@ -1900,7 +1938,7 @@ static int elf_core_dump(struct coredump_params *cprm) set_fs(KERNEL_DS); offset += sizeof(*elf); /* Elf header */ - offset += (segs + 1) * sizeof(struct elf_phdr); /* Program headers */ + offset += segs * sizeof(struct elf_phdr); /* Program headers */ foffset = offset; /* Write notes phdr entry */ @@ -1926,6 +1964,19 @@ static int elf_core_dump(struct coredump_params *cprm) */ mm_flags = current->mm->flags; + offset += elf_core_vma_data_size(gate_vma, mm_flags); + offset += elf_core_extra_data_size(); + e_shoff = offset; + + if (e_phnum == PN_XNUM) { + shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL); + if (!shdr4extnum) + goto end_coredump; + fill_extnum_info(elf, shdr4extnum, e_shoff, segs); + } + + offset = dataoff; + size += sizeof(*elf); if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf))) goto end_coredump; @@ -2003,11 +2054,20 @@ static int elf_core_dump(struct coredump_params *cprm) if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit)) goto end_coredump; + if (e_phnum == PN_XNUM) { + size += sizeof(*shdr4extnum); + if (size > cprm->limit + || !dump_write(cprm->file, shdr4extnum, + sizeof(*shdr4extnum))) + goto end_coredump; + } + end_coredump: set_fs(fs); cleanup: free_note_info(&info); + kfree(shdr4extnum); kfree(phdr4note); kfree(elf); out: diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 112da491d75d..e49d9c06a4b6 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1505,6 +1505,22 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t) return sz; } +static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, + elf_addr_t e_shoff, int segs) +{ + elf->e_shoff = e_shoff; + elf->e_shentsize = sizeof(*shdr4extnum); + elf->e_shnum = 1; + elf->e_shstrndx = SHN_UNDEF; + + memset(shdr4extnum, 0, sizeof(*shdr4extnum)); + + shdr4extnum->sh_type = SHT_NULL; + shdr4extnum->sh_size = elf->e_shnum; + shdr4extnum->sh_link = elf->e_shstrndx; + shdr4extnum->sh_info = segs; +} + /* * dump the segments for an MMU process */ @@ -1569,6 +1585,17 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size, } #endif +static size_t elf_core_vma_data_size(unsigned long mm_flags) +{ + struct vm_area_struct *vma; + size_t size = 0; + + for (vma = current->mm->mmap; vma; vma->vm_next) + if (maydump(vma, mm_flags)) + size += vma->vm_end - vma->vm_start; + return size; +} + /* * Actual dumper * @@ -1601,6 +1628,9 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) elf_addr_t *auxv; unsigned long mm_flags; struct elf_phdr *phdr4note = NULL; + struct elf_shdr *shdr4extnum = NULL; + Elf_Half e_phnum; + elf_addr_t e_shoff; /* * We no longer stop all VM operations. @@ -1667,8 +1697,16 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) segs = current->mm->map_count; segs += elf_core_extra_phdrs(); + /* for notes section */ + segs++; + + /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid + * this, kernel supports extended numbering. Have a look at + * include/linux/elf.h for further information. */ + e_phnum = segs > PN_XNUM ? PN_XNUM : segs; + /* Set up header */ - fill_elf_fdpic_header(elf, segs + 1); /* including notes section */ + fill_elf_fdpic_header(elf, e_phnum); has_dumped = 1; current->flags |= PF_DUMPCORE; @@ -1708,7 +1746,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) set_fs(KERNEL_DS); offset += sizeof(*elf); /* Elf header */ - offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */ + offset += segs * sizeof(struct elf_phdr); /* Program headers */ foffset = offset; /* Write notes phdr entry */ @@ -1738,6 +1776,19 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) */ mm_flags = current->mm->flags; + offset += elf_core_vma_data_size(mm_flags); + offset += elf_core_extra_data_size(); + e_shoff = offset; + + if (e_phnum == PN_XNUM) { + shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL); + if (!shdr4extnum) + goto end_coredump; + fill_extnum_info(elf, shdr4extnum, e_shoff, segs); + } + + offset = dataoff; + size += sizeof(*elf); if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf))) goto end_coredump; @@ -1802,6 +1853,14 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit)) goto end_coredump; + if (e_phnum == PN_XNUM) { + size += sizeof(*shdr4extnum); + if (size > cprm->limit + || !dump_write(cprm->file, shdr4extnum, + sizeof(*shdr4extnum))) + goto end_coredump; + } + if (cprm->file->f_pos != offset) { /* Sanity check */ printk(KERN_WARNING diff --git a/include/linux/elf.h b/include/linux/elf.h index ccde3fd45f36..597858418051 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -50,6 +50,28 @@ typedef __s64 Elf64_Sxword; #define PT_GNU_STACK (PT_LOOS + 0x474e551) +/* + * Extended Numbering + * + * If the real number of program header table entries is larger than + * or equal to PN_XNUM(0xffff), it is set to sh_info field of the + * section header at index 0, and PN_XNUM is set to e_phnum + * field. Otherwise, the section header at index 0 is zero + * initialized, if it exists. + * + * Specifications are available in: + * + * - Sun microsystems: Linker and Libraries. + * Part No: 817-1984-17, September 2008. + * URL: http://docs.sun.com/app/docs/doc/817-1984 + * + * - System V ABI AMD64 Architecture Processor Supplement + * Draft Version 0.99., + * May 11, 2009. + * URL: http://www.x86-64.org/ + */ +#define PN_XNUM 0xffff + /* These constants define the different elf file types */ #define ET_NONE 0 #define ET_REL 1 @@ -286,7 +308,7 @@ typedef struct elf64_phdr { #define SHN_COMMON 0xfff2 #define SHN_HIRESERVE 0xffff -typedef struct { +typedef struct elf32_shdr { Elf32_Word sh_name; Elf32_Word sh_type; Elf32_Word sh_flags; @@ -394,6 +416,7 @@ typedef struct elf64_note { extern Elf32_Dyn _DYNAMIC []; #define elfhdr elf32_hdr #define elf_phdr elf32_phdr +#define elf_shdr elf32_shdr #define elf_note elf32_note #define elf_addr_t Elf32_Off #define Elf_Half Elf32_Half @@ -403,6 +426,7 @@ extern Elf32_Dyn _DYNAMIC []; extern Elf64_Dyn _DYNAMIC []; #define elfhdr elf64_hdr #define elf_phdr elf64_phdr +#define elf_shdr elf64_shdr #define elf_note elf64_note #define elf_addr_t Elf64_Off #define Elf_Half Elf64_Half diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index cfda74f521b5..e687bc3ba4da 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -166,5 +166,6 @@ elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, unsigned long limit); extern int elf_core_write_extra_data(struct file *file, size_t *size, unsigned long limit); +extern size_t elf_core_extra_data_size(void); #endif /* _LINUX_ELFCORE_H */ diff --git a/kernel/elfcore.c b/kernel/elfcore.c index 5445741f4b4c..ff915efef66d 100644 --- a/kernel/elfcore.c +++ b/kernel/elfcore.c @@ -21,3 +21,8 @@ int __weak elf_core_write_extra_data(struct file *file, size_t *size, { return 1; } + +size_t __weak elf_core_extra_data_size(void) +{ + return 0; +} -- cgit v1.2.3 From 30736a4d43f4af7f1a7836d6a266be17082195c4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu <mhiramat@redhat.com> Date: Fri, 5 Mar 2010 13:44:12 -0800 Subject: coredump: pass mm->flags as a coredump parameter for consistency Pass mm->flags as a coredump parameter for consistency. --- 1787 if (mm->core_state || !get_dumpable(mm)) { <- (1) 1788 up_write(&mm->mmap_sem); 1789 put_cred(cred); 1790 goto fail; 1791 } 1792 [...] 1798 if (get_dumpable(mm) == 2) { /* Setuid core dump mode */ <-(2) 1799 flag = O_EXCL; /* Stop rewrite attacks */ 1800 cred->fsuid = 0; /* Dump root private */ 1801 } --- Since dumpable bits are not protected by lock, there is a chance to change these bits between (1) and (2). To solve this issue, this patch copies mm->flags to coredump_params.mm_flags at the beginning of do_coredump() and uses it instead of get_dumpable() while dumping core. This copy is also passed to binfmt->core_dump, since elf*_core_dump() uses dump_filter bits in mm->flags. [akpm@linux-foundation.org: fix merge] Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com> Acked-by: Roland McGrath <roland@redhat.com> Cc: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Ingo Molnar <mingo@elte.hu> Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/binfmt_elf.c | 14 +++----------- fs/binfmt_elf_fdpic.c | 14 +++----------- fs/exec.c | 20 ++++++++++++++++---- include/linux/binfmts.h | 1 + 4 files changed, 23 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 78de530cfb02..535e763ab1a6 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1882,7 +1882,6 @@ static int elf_core_dump(struct coredump_params *cprm) struct vm_area_struct *vma, *gate_vma; struct elfhdr *elf = NULL; loff_t offset = 0, dataoff, foffset; - unsigned long mm_flags; struct elf_note_info info; struct elf_phdr *phdr4note = NULL; struct elf_shdr *shdr4extnum = NULL; @@ -1957,14 +1956,7 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - /* - * We must use the same mm->flags while dumping core to avoid - * inconsistency between the program headers and bodies, otherwise an - * unusable core file can be generated. - */ - mm_flags = current->mm->flags; - - offset += elf_core_vma_data_size(gate_vma, mm_flags); + offset += elf_core_vma_data_size(gate_vma, cprm->mm_flags); offset += elf_core_extra_data_size(); e_shoff = offset; @@ -1995,7 +1987,7 @@ static int elf_core_dump(struct coredump_params *cprm) phdr.p_offset = offset; phdr.p_vaddr = vma->vm_start; phdr.p_paddr = 0; - phdr.p_filesz = vma_dump_size(vma, mm_flags); + phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags); phdr.p_memsz = vma->vm_end - vma->vm_start; offset += phdr.p_filesz; phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; @@ -2030,7 +2022,7 @@ static int elf_core_dump(struct coredump_params *cprm) unsigned long addr; unsigned long end; - end = vma->vm_start + vma_dump_size(vma, mm_flags); + end = vma->vm_start + vma_dump_size(vma, cprm->mm_flags); for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { struct page *page; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index e49d9c06a4b6..6d6a16c5e9bb 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1626,7 +1626,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) #endif int thread_status_size = 0; elf_addr_t *auxv; - unsigned long mm_flags; struct elf_phdr *phdr4note = NULL; struct elf_shdr *shdr4extnum = NULL; Elf_Half e_phnum; @@ -1769,14 +1768,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) /* Page-align dumped data */ dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - /* - * We must use the same mm->flags while dumping core to avoid - * inconsistency between the program headers and bodies, otherwise an - * unusable core file can be generated. - */ - mm_flags = current->mm->flags; - - offset += elf_core_vma_data_size(mm_flags); + offset += elf_core_vma_data_size(cprm->mm_flags); offset += elf_core_extra_data_size(); e_shoff = offset; @@ -1809,7 +1801,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) phdr.p_offset = offset; phdr.p_vaddr = vma->vm_start; phdr.p_paddr = 0; - phdr.p_filesz = maydump(vma, mm_flags) ? sz : 0; + phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0; phdr.p_memsz = sz; offset += phdr.p_filesz; phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; @@ -1847,7 +1839,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit, - mm_flags) < 0) + cprm->mm_flags) < 0) goto end_coredump; if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit)) diff --git a/fs/exec.c b/fs/exec.c index da2b31dc4e1c..89d4080c1435 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1748,14 +1748,19 @@ void set_dumpable(struct mm_struct *mm, int value) } } -int get_dumpable(struct mm_struct *mm) +static int __get_dumpable(unsigned long mm_flags) { int ret; - ret = mm->flags & 0x3; + ret = mm_flags & MMF_DUMPABLE_MASK; return (ret >= 2) ? 2 : ret; } +int get_dumpable(struct mm_struct *mm) +{ + return __get_dumpable(mm->flags); +} + static void wait_for_dump_helpers(struct file *file) { struct pipe_inode_info *pipe; @@ -1799,6 +1804,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) .signr = signr, .regs = regs, .limit = rlimit(RLIMIT_CORE), + /* + * We must use the same mm->flags while dumping core to avoid + * inconsistency of bit flags, since this flag is not protected + * by any locks. + */ + .mm_flags = mm->flags, }; audit_core_dumps(signr); @@ -1817,7 +1828,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) /* * If another thread got here first, or we are not dumpable, bail out. */ - if (mm->core_state || !get_dumpable(mm)) { + if (mm->core_state || !__get_dumpable(cprm.mm_flags)) { up_write(&mm->mmap_sem); put_cred(cred); goto fail; @@ -1828,7 +1839,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) * process nor do we know its entire history. We only know it * was tainted so we dump it as root in mode 2. */ - if (get_dumpable(mm) == 2) { /* Setuid core dump mode */ + if (__get_dumpable(cprm.mm_flags) == 2) { + /* Setuid core dump mode */ flag = O_EXCL; /* Stop rewrite attacks */ cred->fsuid = 0; /* Dump root private */ } diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 89c6249fc561..c809e286d213 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -74,6 +74,7 @@ struct coredump_params { struct pt_regs *regs; struct file *file; unsigned long limit; + unsigned long mm_flags; }; /* -- cgit v1.2.3 From 5c99cbf49a6e1a1efd25b11f4604c65c455e1612 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Fri, 5 Mar 2010 13:44:14 -0800 Subject: coredump: set ->group_exit_code for other CLONE_VM tasks too User visible change. do_coredump() kills all threads which share the same ->mm but only the coredumping process gets the proper exit_code. Other tasks which share the same ->mm die "silently" and return status == 0 to parent. This is historical behaviour, not actually a bug. But I think Frank Heckenbach rightly dislikes the current behaviour. Simple test-case: #include <stdio.h> #include <unistd.h> #include <signal.h> #include <sys/wait.h> int main(void) { int stat; if (!fork()) { if (!vfork()) kill(getpid(), SIGQUIT); } wait(&stat); printf("stat=%x\n", stat); return 0; } Before this patch it prints "stat=0" despite the fact the child was killed by SIGQUIT. After this patch the output is "stat=3" which obviously makes more sense. Even with this patch, only the task which originates the coredumping gets "|= 0x80" if the core was actually dumped, but at least the coredumping signal is visible to do_wait/etc. Reported-by: Frank Heckenbach <f.heckenbach@fh-soft.de> Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: WANG Cong <xiyou.wangcong@gmail.com> Cc: Roland McGrath <roland@redhat.com> Cc: Neil Horman <nhorman@tuxdriver.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/exec.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 89d4080c1435..829a6c6d1803 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1561,12 +1561,13 @@ out: return ispipe; } -static int zap_process(struct task_struct *start) +static int zap_process(struct task_struct *start, int exit_code) { struct task_struct *t; int nr = 0; start->signal->flags = SIGNAL_GROUP_EXIT; + start->signal->group_exit_code = exit_code; start->signal->group_stop_count = 0; t = start; @@ -1591,8 +1592,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, spin_lock_irq(&tsk->sighand->siglock); if (!signal_group_exit(tsk->signal)) { mm->core_state = core_state; - tsk->signal->group_exit_code = exit_code; - nr = zap_process(tsk); + nr = zap_process(tsk, exit_code); } spin_unlock_irq(&tsk->sighand->siglock); if (unlikely(nr < 0)) @@ -1641,7 +1641,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (p->mm) { if (unlikely(p->mm == mm)) { lock_task_sighand(p, &flags); - nr += zap_process(p); + nr += zap_process(p, exit_code); unlock_task_sighand(p, &flags); } break; -- cgit v1.2.3 From 76595f79d76fbe6267a51b3a866a028d150f06d4 Mon Sep 17 00:00:00 2001 From: Neil Horman <nhorman@tuxdriver.com> Date: Fri, 5 Mar 2010 13:44:16 -0800 Subject: coredump: suppress uid comparison test if core output files are pipes Modify uid check in do_coredump so as to not apply it in the case of pipes. This just got noticed in testing. The end of do_coredump validates the uid of the inode for the created file against the uid of the crashing process to ensure that no one can pre-create a core file with different ownership and grab the information contained in the core when they shouldn' tbe able to. This causes failures when using pipes for a core dumps if the crashing process is not root, which is the uid of the pipe when it is created. The fix is simple. Since the check for matching uid's isn't relevant for pipes (a process can't create a pipe that the uermodehelper code will open anyway), we can just just skip it in the event ispipe is non-zero Reverts a pipe-affecting change which was accidentally made in : commit c46f739dd39db3b07ab5deb4e3ec81e1c04a91af : Author: Ingo Molnar <mingo@elte.hu> : AuthorDate: Wed Nov 28 13:59:18 2007 +0100 : Commit: Linus Torvalds <torvalds@woody.linux-foundation.org> : CommitDate: Wed Nov 28 10:58:01 2007 -0800 : : vfs: coredumping fix Signed-off-by: Neil Horman <nhorman@tuxdriver.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Alan Cox <alan@lxorguk.ukuu.org.uk> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/exec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 829a6c6d1803..49cdaa19e5b9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1936,8 +1936,9 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) /* * Dont allow local users get cute and trick others to coredump * into their pre-created files: + * Note, this is not relevant for pipes */ - if (inode->i_uid != current_fsuid()) + if (!ispipe && (inode->i_uid != current_fsuid())) goto close_fail; if (!cprm.file->f_op) goto close_fail; -- cgit v1.2.3 From b8fa05719ba4349be80ce929237249b57886a203 Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@linux-foundation.org> Date: Sun, 7 Mar 2010 09:54:44 -0800 Subject: Revert "lib: build list_sort() only if needed" This reverts commit a069c266ae5fdfbf5b4aecf2c672413aa33b2504. It turns ou that not only was it missing a case (XFS) that needed it, but perhaps more importantly, people sometimes want to enable new modules that they hadn't had enabled before, and if such a module uses list_sort(), it can't easily be inserted any more. So rather than add a "select LIST_SORT" to the XFS case, just leave it compiled in. It's not all _that_ big, after all, and the inconvenience isn't worth it. Requested-by: Alexey Dobriyan <adobriyan@gmail.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: Don Mullis <don.mullis@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Dave Chinner <david@fromorbit.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- drivers/gpu/drm/Kconfig | 1 - fs/ubifs/Kconfig | 1 - lib/Kconfig | 3 --- lib/Makefile | 3 +-- 4 files changed, 1 insertion(+), 7 deletions(-) (limited to 'fs') diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 3d2ab03f1296..305c59003963 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -9,7 +9,6 @@ menuconfig DRM depends on (AGP || AGP=n) && PCI && !EMULATED_CMPXCHG && MMU select I2C select I2C_ALGOBIT - select LIST_SORT help Kernel-level support for the Direct Rendering Infrastructure (DRI) introduced in XFree86 4.0. If you say Y here, you need to select diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 430c69f39842..830e3f76f442 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -7,7 +7,6 @@ config UBIFS_FS select CRYPTO if UBIFS_FS_ZLIB select CRYPTO_LZO if UBIFS_FS_LZO select CRYPTO_DEFLATE if UBIFS_FS_ZLIB - select LIST_SORT depends on MTD_UBI help UBIFS is a file system for flash devices which works on top of UBI. diff --git a/lib/Kconfig b/lib/Kconfig index 496d16e1fa2c..170d8ca901d8 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -160,9 +160,6 @@ config TEXTSEARCH_BM config TEXTSEARCH_FSM tristate -config LIST_SORT - boolean - config BTREE boolean diff --git a/lib/Makefile b/lib/Makefile index 59e46a014bc6..2e152aed7198 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -21,7 +21,7 @@ lib-y += kobject.o kref.o klist.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ - string_helpers.o gcd.o + string_helpers.o gcd.o list_sort.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG @@ -40,7 +40,6 @@ lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o obj-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o -obj-$(CONFIG_LIST_SORT) += list_sort.o obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o obj-$(CONFIG_BTREE) += btree.o obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o -- cgit v1.2.3 From 138860b95359feee49a31fb12be87825e4cd84ac Mon Sep 17 00:00:00 2001 From: Randy Dunlap <randy.dunlap@oracle.com> Date: Thu, 4 Mar 2010 09:37:12 -0800 Subject: seq_file: fix new kernel-doc warnings Fix kernel-doc notation in new seq-file functions and correct spelling. Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/seq_file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index 5afd554efad3..e1f437be6c3c 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -734,7 +734,7 @@ EXPORT_SYMBOL(seq_hlist_start_head); * seq_hlist_next - move to the next position of the hlist * @v: the current iterator * @head: the head of the hlist - * @pos: the current posision + * @ppos: the current position * * Called at seq_file->op->next(). */ @@ -800,7 +800,7 @@ EXPORT_SYMBOL(seq_hlist_start_head_rcu); * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU * @v: the current iterator * @head: the head of the hlist - * @pos: the current posision + * @ppos: the current position * * Called at seq_file->op->next(). * -- cgit v1.2.3